Skip to content

Commit 2e4a76b

Browse files
committed
Fix edge case odd punctuation issues
Adds logic to correct odd punctuation in the returned translated text that results in issues such as a line beginning with a comma.
1 parent 69ed9cd commit 2e4a76b

File tree

2 files changed

+46
-15
lines changed

2 files changed

+46
-15
lines changed

Scripts/translate.py

+45-14
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,15 @@
2828
# Add span tags around certain words to exclude them from being translated
2929
def add_notranslate_tags_from_notranslate_file(text, phraseList, customNoTranslateTag=None):
3030
for word in phraseList:
31-
findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{word}[.,!?()]?["\']?)(\p{{Z}}|$)' #\p ensures it works with unicode characters
31+
findWordRegex = rf'\b{word}\b'
3232
findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
33-
# Find the word, with optional punctuation after, and optional quotes before or after
33+
3434
if not customNoTranslateTag:
35-
text = findWordRegexCompiled.sub(r'\1<span class="notranslate">\2</span>\3', text)
35+
# Directly replace the word with a span tag
36+
text = findWordRegexCompiled.sub(rf'<span class="notranslate">{word}</span>', text)
3637
else:
37-
# Add custom XML tag
38-
text = findWordRegexCompiled.sub(rf'\1<{customNoTranslateTag}>\2</{customNoTranslateTag}>\3', text)
38+
# Replace the word with a custom XML tag
39+
text = findWordRegexCompiled.sub(rf'<{customNoTranslateTag}>{word}</{customNoTranslateTag}>', text)
3940
return text
4041

4142
def remove_notranslate_tags(text, customNoTranslateTag=None):
@@ -50,12 +51,16 @@ def add_notranslate_tags_for_manual_translations(text, langcode, customTag=None)
5051
# Only replace text if the language matches the entry in the manual translations file
5152
if manualTranslatedText['Language Code'] == langcode:
5253
originalText = manualTranslatedText['Original Text']
53-
findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{originalText}[.,!?()]?["\']?)(\p{{Z}}|$)'
54+
findWordRegex = rf'\b{originalText}\b'
5455
findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
55-
if customTag == None:
56-
text = findWordRegexCompiled.sub(r'\1<span class="notranslate">\2</span>\3', text)
56+
57+
if customTag is None:
58+
replacement = rf'<span class="notranslate">{originalText}</span>'
5759
else:
58-
text = findWordRegexCompiled.sub(rf'\1<{customTag}>\2</{customTag}>\3', text)
60+
replacement = rf'<{customTag}>{originalText}</{customTag}>'
61+
62+
text = findWordRegexCompiled.sub(replacement, text)
63+
5964
return text
6065

6166
# Replace certain words or phrases with their manual translation
@@ -65,10 +70,11 @@ def replace_manual_translations(text, langcode):
6570
if manualTranslatedText['Language Code'] == langcode:
6671
originalText = manualTranslatedText['Original Text']
6772
translatedText = manualTranslatedText['Translated Text']
68-
findWordRegex = rf'(\p{{Z}}|^)(["\'()]?{originalText}[.,!?()]?["\']?)(\p{{Z}}|$)'
73+
findWordRegex = rf'\b{originalText}\b'
6974
findWordRegexCompiled = regex.compile(findWordRegex, flags=re.IGNORECASE | re.UNICODE)
7075
# Substitute the matched word with the translated text
71-
text = findWordRegexCompiled.sub(rf'\1{translatedText}\3', text)
76+
text = findWordRegexCompiled.sub(translatedText, text)
77+
7278
return text
7379

7480

@@ -153,12 +159,37 @@ def add_marker_and_convert_to_string(textList, customMarkerTag):
153159
if i == len(textList) - 1:
154160
combinedString += text
155161
else:
156-
combinedString += text + f" {customMarkerTag} "
162+
combinedString += text + f" {customMarkerTag}"
157163
return combinedString
158164

159165
def split_and_clean_marked_combined_string(originalCombinedString, customMarkerTag, removeExtraAddedTag=None):
166+
# Fix issue where sometimes double commas or punctuation are added near tags
167+
punctuation = ",、.。" # Add more comma types if needed
168+
escapedPunctuationChars = re.escape(punctuation)
169+
doublePunctuationPattern = rf"([.{escapedPunctuationChars}]\s*(?:<[^>]+>\s*)*[.{escapedPunctuationChars}]?\s*{customMarkerTag}\s*)[.{escapedPunctuationChars}]"
170+
# Replace the entire match with the captured group, which drops the redundant trailing punctuation character
171+
fixedCombinedString = re.sub(doublePunctuationPattern, r'\1', originalCombinedString)
172+
173+
# Fix issue where a comma or period is placed after the marker tag, which causes that punctuation to appear at the beginning of the next line
174+
fixMisplacedCommaPattern = rf"({customMarkerTag}\s?)([{escapedPunctuationChars}])"
175+
fixedCombinedString = re.sub(fixMisplacedCommaPattern, r"\2\1", fixedCombinedString)
176+
177+
# Fix issue where an extra space is added between a closing tag and the punctuation that follows it. This matches any closing HTML tag, then one or more whitespace characters, then a punctuation character
178+
fixExtraSpaceAfterTagPattern = rf"(</[^>]+>)\s+([{escapedPunctuationChars}])"
179+
fixedCombinedString = re.sub(fixExtraSpaceAfterTagPattern, r"\1\2", fixedCombinedString)
180+
181+
# # Fix resulting comma appearing directly after period after correcting other issues
182+
# escaped_periods = re.escape(".。") # Standard and Japanese periods
183+
# escaped_commas = re.escape(",、") # Standard and Japanese commas
184+
# commaAfterPeriodPattern = rf"([{escaped_periods}])([{escaped_commas}])"
185+
# # Use re.sub to remove the comma in such cases (replace with just the period)
186+
# fixedCombinedString = re.sub(commaAfterPeriodPattern, r"\1", fixedCombinedString)
187+
188+
# Fix issue where hyphen is placed in addition to comma resulting in -,
189+
fixedCombinedString = fixedCombinedString.replace(f' -,', f',')
190+
160191
# Split the translated text into chunks based on the custom marker tags, and remove the tags
161-
textList = originalCombinedString.split(f'{customMarkerTag}')
192+
textList = fixedCombinedString.split(f'{customMarkerTag}')
162193
# Strip spaces off ends of lines, then remove tag, and strip spaces again to remove any leftover
163194
textList = [text.strip() for text in textList]
164195
textList = [text.replace(f'{customMarkerTag}', '') for text in textList]
@@ -230,7 +261,7 @@ def translate_with_deepl_and_process(textList, targetLanguage, formality=None, c
230261
# Handle weird quirk of DeepL where it sometimes adds parentheses around the tag
231262
# Pattern to find parentheses around the custom tag with potential spaces. Also handles full-width parentheses
232263
pattern = r'[((]\s*<xxx>\s*[))]'
233-
translatedText = re.sub(pattern, ' <xxx> ', translatedText)
264+
translatedText = re.sub(pattern, ' <xxx>', translatedText)
234265

235266
# Split the translated text into chunks based on the custom marker tags, and remove the tags
236267
translatedTextsList = split_and_clean_marked_combined_string(translatedText, customMarkerTag='<xxx>')

main.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# License: GPLv3
77
# NOTE: By contributing to this project, you agree to the terms of the GPLv3 license, and agree to grant the project owner the right to also provide or sell this software, including your contribution, to anyone under any other license, with no compensation to you.
88

9-
version = '0.19.0'
9+
version = '0.20.0'
1010
print(f"------- 'Auto Synced Translated Dubs' script by ThioJoe - Release version {version} -------")
1111

1212
# Import other files

0 commit comments

Comments
 (0)