File size: 8,311 Bytes
23f8e38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import logging
logger = logging.getLogger(__name__)

import os
import re
from deep_translator import GoogleTranslator
from deep_translator import exceptions
from gematria import calculate_gematria
import math
import xml.etree.ElementTree as ET
import glob

# Matches every run of characters that is NEITHER a Hebrew letter
# (U+05D0-U+05EA) NOR whitespace. Substituting "" for the matches therefore
# removes diacritics, punctuation and any other non-letter characters while
# keeping the letters and the whitespace between them.
DIACRITICS_REGEX = re.compile(r"[^\u05D0-\u05EA\s]+")

def _collect_book_text(root, strip_spaces, strip_in_braces, strip_diacritics):
    """Return the cleaned text of one parsed book as a single string."""
    # Each <v> element is a verse; the text of its direct <w> children is
    # joined with spaces, and the verses are joined with spaces in turn.
    verses = (
        " ".join("".join(word.itertext()) for word in verse.findall('./w'))
        for verse in root.findall('.//v')
    )
    full_text = " ".join(verses)

    if strip_in_braces:
        # Drop everything enclosed in square brackets (non-greedy).
        full_text = re.sub(r"\[.*?\]", "", full_text, flags=re.DOTALL)
    if strip_diacritics:
        full_text = DIACRITICS_REGEX.sub("", full_text)
    if strip_spaces:
        # Only the plain space character is removed here.
        return full_text.replace(" ", "")
    # Otherwise collapse every whitespace run into a single space.
    return re.sub(r'\s+', ' ', full_text)


def _walk(text, index, stride, count):
    """Pick *count* letters from *text*, starting at *index* and moving *stride*
    positions (wrapping around) after each pick.

    Returns the picked letters and the index the walk ends on.
    """
    if count <= 0:
        return "", index
    size = len(text)
    letters = "".join(text[(index + k * stride) % size] for k in range(count))
    return letters, (index + count * stride) % size


def _pick_round(full_text, step, r):
    """Return the letters that a single round value *r* selects from *full_text*."""
    text_length = len(full_text)
    abs_r = abs(r)
    full_passes = math.floor(abs_r)
    remainder = abs_r - full_passes

    base_chars = text_length // step
    if base_chars == 0:
        # The step is longer than the text: a pass yields one letter, and only
        # when more than one round was requested.
        chars_per_full_pass = 1 if abs_r > 1 else 0
        chars_for_remainder = 0
    else:
        chars_per_full_pass = base_chars
        chars_for_remainder = math.floor(base_chars * remainder)

    if r > 0:
        # Forward: the first pick is the step-th letter.
        stride = step
        current_index = (step - 1) % text_length
    else:
        # Zero or negative: walk backwards, starting step letters from the end.
        stride = -step
        current_index = (text_length - step) % text_length

    pass_result = ""
    if full_passes > 0 and chars_per_full_pass > 0:
        # Only the last full pass is kept. The earlier passes merely advance
        # the position, so jump over them arithmetically instead of walking
        # them letter by letter.
        skipped = chars_per_full_pass * (full_passes - 1)
        current_index = (current_index + skipped * stride) % text_length
        pass_result, current_index = _walk(
            full_text, current_index, stride, chars_per_full_pass)

    # The fractional part of the round continues from where the passes ended.
    tail, _ = _walk(full_text, current_index, stride, chars_for_remainder)
    return pass_result + tail


def process_json_files(start, end, step, rounds="1", length=0,
                       tlang="en",
                       strip_spaces=True,
                       strip_in_braces=True,
                       strip_diacritics=True,
                       translate=False):
    """Select every ``step``-th letter from the books numbered ``start``..``end``.

    Despite its name, the function reads XML: for each book number ``i`` it
    parses every file matching ``texts/tanach/<i, two digits>*.xml``, joins
    the text of all ``<w>`` elements found in ``<v>`` elements, cleans it and
    then picks letters as described by ``rounds``.

    Args:
        start: First book number (inclusive).
        end: Last book number (inclusive).
        step: Distance between picked letters. Must not be 0.
        rounds: Comma-separated numbers, one per round. A positive value walks
            forward from the ``step``-th letter; zero or a negative value
            walks backward starting ``step`` letters from the end. The integer
            part is the number of passes (only the letters of the last pass
            are kept); a fractional part appends that fraction of one more
            pass. The letters of all rounds are concatenated.
        length: If non-zero, ``result_text`` is cut with ``[:length]``.
        tlang: Target language handed to ``GoogleTranslator``.
        strip_spaces: Remove all space characters; otherwise whitespace runs
            are collapsed to one space.
        strip_in_braces: Remove text enclosed in square brackets.
        strip_diacritics: Remove everything matched by ``DIACRITICS_REGEX``.
        translate: Translate the picked text with ``GoogleTranslator``.

    Returns:
        A list of dicts. A successful entry has the keys ``book``, ``title``,
        ``result_text``, ``result_sum``, ``translated_text`` and
        ``source_language``; a failure is reported as ``{"error": message}``.
        Files whose cleaned text or picked text is empty add no entry.

    Raises:
        ValueError: If ``step`` is 0 or ``rounds`` contains a value that is
            not a number.
    """
    # Fail fast: a zero step used to surface as a ZeroDivisionError deep inside
    # the per-file loop.
    if step == 0:
        raise ValueError("step must not be 0")
    # The rounds do not depend on the file, so parse them once.
    rounds_list = [float(part) for part in rounds.split(',')]  # Allow floats

    base_path = "texts/tanach"
    translator = GoogleTranslator(source='auto', target=tlang)
    results = []

    for i in range(start, end + 1):
        file_pattern = f"{base_path}/{i:02}*.xml"
        # glob gives no ordering guarantee; sort for reproducible output.
        matching_files = sorted(glob.glob(file_pattern))

        if not matching_files:
            results.append({"error": f"No file matching pattern '{file_pattern}' found."})
            continue

        for file_name in matching_files:
            try:
                root = ET.parse(file_name).getroot()

                full_text = _collect_book_text(
                    root, strip_spaces, strip_in_braces, strip_diacritics)
                if not full_text:
                    # Nothing left after cleaning: skip this file.
                    continue

                result_text = "".join(
                    _pick_round(full_text, step, r) for r in rounds_list)

                # Optional translation. NOTE: it is done on the full picked
                # text, i.e. before the `length` truncation below.
                translated_text = ""
                if translate and result_text:
                    translated_text = translator.translate(result_text)

                if length != 0:
                    result_text = result_text[:length]

                # Only report files that produced some text.
                if result_text:
                    title_el = root.find('.//names/name')
                    # Fall back to the file name when the book has no name element.
                    title_str = title_el.text if title_el is not None else os.path.basename(file_name)
                    results.append({
                        "book": i,
                        "title": title_str,
                        "result_text": result_text,
                        "result_sum": calculate_gematria(result_text),
                        "translated_text": translated_text,
                        "source_language": "he",
                    })

            except FileNotFoundError:
                results.append({"error": f"File {file_name} not found."})
            except ET.ParseError as e:
                results.append({"error": f"File {file_name} could not be read as XML: {e}"})
            except KeyError as e:
                # NOTE(review): the wording looks like a leftover from an
                # earlier JSON-based version; kept unchanged for callers.
                results.append({"error": f"Expected key 'text' is missing in {file_name}: {e}"})

    # Lazy %-formatting: the (possibly large) list is only rendered when debug
    # logging is enabled.
    logger.debug("Returning results from torah.process_json_files: %s", results)
    return results


# Tests
# Import-time self-check: every case calls process_json_files on book 0 and
# compares the first entry's 'result_text' with the expected string (None
# means "no entries expected"). Judging by the expected values, book 0 is
# presumably a fixture holding the Hebrew alphabet in order.
test_results = [
    (process_json_files(0, 0, 21, rounds="3", length=0), "ק"),
    (process_json_files(0, 0, 22, rounds="1", length=0), "ת"),
    (process_json_files(0, 0, 22, rounds="3", length=0), "ת"),
    (process_json_files(0, 0, 23, rounds="3", length=0), "ג"),
    (process_json_files(0, 0, 11, rounds="1", length=0), "כת"),
    (process_json_files(0, 0, 2, rounds="1", length=0), "בדוחילנעצרת"),
    (process_json_files(0, 0, 23, rounds="1", length=0), None),  # Expect None, when no results
    (process_json_files(0, 0, 23, rounds="-1", length=0), None),  # Expect None, when no results
    (process_json_files(0, 0, 22, rounds="-1", length=0), "א"),
    (process_json_files(0, 0, 22, rounds="-2", length=0), "א"),
    (process_json_files(0, 0, 1, rounds="1,-1", length=0), "אבגדהוזחטיכלמנסעפצקרשתתשרקצפעסנמלכיטחזוהדגבא"), # Combined rounds
    (process_json_files(0, 0, 1, rounds="-1", length=0), "תשרקצפעסנמלכיטחזוהדגבא"), # Reversed Hebrew alphabet
    (process_json_files(0, 0, 1, rounds="-1.5", length=0), "תשרקצפעסנמלכיטחזוהדגבאתשרקצפעסנמל"), # Fractional rounds
]

all_tests_passed = True
for result, expected in test_results:
    # The first entry can be an {"error": ...} dict (e.g. when no file matches
    # the pattern), which has no 'result_text' key. Use .get() so that such a
    # run is logged as a failed test instead of aborting the import with a
    # KeyError.
    first_entry = result[0] if result else {}
    result_text = first_entry.get('result_text')

    if expected is None:  # No result is expected
        if not result:
            logger.warning("Test passed: Expected no results, got no results.")
        else:
            logger.error(f"Test failed: Expected no results, but got: {result_text}")
            all_tests_passed = False
    elif not result:
        logger.error(f"Test failed: Expected '{expected}', but got no results")
        all_tests_passed = False
    elif result_text == expected:
        logger.warning(f"Test passed: Expected '{expected}', got '{result_text}'")
    else:
        logger.error(f"Test failed: Expected '{expected}', but got '{result_text}'")
        all_tests_passed = False

if all_tests_passed:
    logger.info("All round tests passed.")