28
28
# Add span tags around certain words to exclude them from being translated
29
29
def add_notranslate_tags_from_notranslate_file(text, phraseList, customNoTranslateTag=None):
    """Wrap every occurrence of each phrase in a tag that marks it as not to be translated.

    Args:
        text: The string to search.
        phraseList: Iterable of phrases to protect. Each phrase is matched
            literally (regex metacharacters are escaped), case-insensitively,
            and only where it is not directly adjacent to another word character.
        customNoTranslateTag: Optional tag name. When falsy, occurrences are
            wrapped in ``<span class="notranslate">...</span>``; otherwise in
            ``<customNoTranslateTag>...</customNoTranslateTag>``.

    Returns:
        The text with each matched occurrence wrapped in the chosen tag. The
        matched text itself is kept exactly as it appeared in ``text``.
    """
    if not customNoTranslateTag:
        openTag, closeTag = '<span class="notranslate">', '</span>'
    else:
        openTag, closeTag = f'<{customNoTranslateTag}>', f'</{customNoTranslateTag}>'

    def wrapMatch(match):
        # Re-emit the matched text itself so the casing found in the source text
        # is preserved, and so nothing in the phrase can be misread as a
        # backreference or escape in a replacement template.
        return f'{openTag}{match.group(0)}{closeTag}'

    for word in phraseList:
        phrase = str(word)
        # A blank phrase would match at every boundary and insert empty tags
        if not phrase.strip():
            continue

        # Escape the phrase so characters such as '.', '+' or '(' are matched literally.
        # Lookarounds are used instead of \b so that phrases that begin or end with a
        # non-word character (e.g. "C++") can still match.
        findWordRegex = rf'(?<!\w){re.escape(phrase)}(?!\w)'
        findWordRegexCompiled = re.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
        text = findWordRegexCompiled.sub(wrapMatch, text)

    return text
40
41
41
42
def remove_notranslate_tags (text , customNoTranslateTag = None ):
@@ -50,12 +51,16 @@ def add_notranslate_tags_for_manual_translations(text, langcode, customTag=None)
50
51
# Only replace text if the language matches the entry in the manual translations file
51
52
if manualTranslatedText ['Language Code' ] == langcode :
52
53
originalText = manualTranslatedText ['Original Text' ]
53
- findWordRegex = rf'(\p{{Z}}|^)(["\'()]? { originalText } [.,!?()]?["\']?)(\p{{Z}}|$) '
54
+ findWordRegex = rf'\b { originalText } \b '
54
55
findWordRegexCompiled = regex .compile (findWordRegex , flags = re .IGNORECASE | re .UNICODE )
55
- if customTag == None :
56
- text = findWordRegexCompiled .sub (r'\1<span class="notranslate">\2</span>\3' , text )
56
+
57
+ if customTag is None :
58
+ replacement = rf'<span class="notranslate">{ originalText } </span>'
57
59
else :
58
- text = findWordRegexCompiled .sub (rf'\1<{ customTag } >\2</{ customTag } >\3' , text )
60
+ replacement = rf'<{ customTag } >{ originalText } </{ customTag } >'
61
+
62
+ text = findWordRegexCompiled .sub (replacement , text )
63
+
59
64
return text
60
65
61
66
# Replace certain words or phrases with their manual translation
@@ -65,10 +70,11 @@ def replace_manual_translations(text, langcode):
65
70
if manualTranslatedText ['Language Code' ] == langcode :
66
71
originalText = manualTranslatedText ['Original Text' ]
67
72
translatedText = manualTranslatedText ['Translated Text' ]
68
- findWordRegex = rf'(\p{{Z}}|^)(["\'()]? { originalText } [.,!?()]?["\']?)(\p{{Z}}|$) '
73
+ findWordRegex = rf'\b { originalText } \b '
69
74
findWordRegexCompiled = regex .compile (findWordRegex , flags = re .IGNORECASE | re .UNICODE )
70
75
# Substitute the matched word with the translated text
71
- text = findWordRegexCompiled .sub (rf'\1{ translatedText } \3' , text )
76
+ text = findWordRegexCompiled .sub (translatedText , text )
77
+
72
78
return text
73
79
74
80
@@ -153,12 +159,37 @@ def add_marker_and_convert_to_string(textList, customMarkerTag):
153
159
if i == len (textList ) - 1 :
154
160
combinedString += text
155
161
else :
156
- combinedString += text + f" { customMarkerTag } "
162
+ combinedString += text + f" { customMarkerTag } "
157
163
return combinedString
158
164
159
165
def split_and_clean_marked_combined_string (originalCombinedString , customMarkerTag , removeExtraAddedTag = None ):
166
+ # Fix issue where sometimes double commas or punctuation are added near tags
167
+ punctuation = ",、.。" # Add more comma types if needed
168
+ escapedPunctuationChars = re .escape (punctuation )
169
+ doublePunctuationPattern = rf"([.{ escapedPunctuationChars } ]\s*(?:<[^>]+>\s*)*[.{ escapedPunctuationChars } ]?\s*{ customMarkerTag } \s*)[.{ escapedPunctuationChars } ]"
170
+ # Replace the entire match with the captured group (excluding the redundant period)
171
+ fixedCombinedString = re .sub (doublePunctuationPattern , r'\1' , originalCombinedString )
172
+
173
+ # Fix issue where a comma is placed after the marker tag, which causes comma to be at the beginning of a line
174
+ fixMisplacedCommaPattern = rf"({ customMarkerTag } \s?)([{ escapedPunctuationChars } ])"
175
+ fixedCombinedString = re .sub (fixMisplacedCommaPattern , r"\2\1" , fixedCombinedString )
176
+
177
+ # Fix issue where after a custom marker tag, an extra space is added between the next punctuation. This matches any ending html tag, then a space, then a punctuation character
178
+ fixExtraSpaceAfterTagPattern = rf"(</[^>]+>)\s+([{ escapedPunctuationChars } ])"
179
+ fixedCombinedString = re .sub (fixExtraSpaceAfterTagPattern , r"\1\2" , fixedCombinedString )
180
+
181
+ # # Fix resulting comma appearing directly after period after correcting other issues
182
+ # escaped_periods = re.escape(".。") # Standard and Japanese periods
183
+ # escaped_commas = re.escape(",、") # Standard and Japanese commas
184
+ # commaAfterPeriodPattern = rf"([{escaped_periods}])([{escaped_commas}])"
185
+ # # Use re.sub to remove the comma in such cases (replace with just the period)
186
+ # fixedCombinedString = re.sub(commaAfterPeriodPattern, r"\1", fixedCombinedString)
187
+
188
+ # Fix issue where hyphen is placed in addition to comma resulting in -,
189
+ fixedCombinedString = fixedCombinedString .replace (f' -,' , f',' )
190
+
160
191
# Split the translated text into chunks based on the custom marker tags, and remove the tags
161
- textList = originalCombinedString .split (f'{ customMarkerTag } ' )
192
+ textList = fixedCombinedString .split (f'{ customMarkerTag } ' )
162
193
# Strip spaces off ends of lines, then remove tag, and strip spaces again to remove any leftover
163
194
textList = [text .strip () for text in textList ]
164
195
textList = [text .replace (f'{ customMarkerTag } ' , '' ) for text in textList ]
@@ -230,7 +261,7 @@ def translate_with_deepl_and_process(textList, targetLanguage, formality=None, c
230
261
# Handle a quirk of DeepL where it sometimes adds parentheses around the tag
231
262
# Pattern to find parentheses around the custom tag, with optional spaces inside. Also handles full-width parentheses
232
263
pattern = r'[((]\s*<xxx>\s*[))]'
233
- translatedText = re .sub (pattern , ' <xxx> ' , translatedText )
264
+ translatedText = re .sub (pattern , ' <xxx>' , translatedText )
234
265
235
266
# Split the translated text into chunks based on the custom marker tags, and remove the tags
236
267
translatedTextsList = split_and_clean_marked_combined_string (translatedText , customMarkerTag = '<xxx>' )
0 commit comments