Merge pull request #168 from zuyu/regex-fix

Fixed regex SyntaxWarnings
This commit is contained in:
okmyworld
2024-06-30 17:16:55 +08:00
committed by GitHub
3 changed files with 41 additions and 43 deletions

View File

@@ -31,7 +31,7 @@ def get_parameter(request, param, default, cast_type):
# 数字转为英文读法
def num_to_english(num):
num_str = str(num)
# English representations for numbers 0-9
english_digits = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
@@ -41,29 +41,29 @@ def num_to_english(num):
need_and = False # Indicates whether 'and' needs to be added
part = [] # Stores each group of 4 digits
is_first_part = True # Indicates if it is the first part for not adding 'and' at the beginning
# Split the number into 3-digit groups
while num_str:
part.append(num_str[-3:])
num_str = num_str[:-3]
part.reverse()
for i, p in enumerate(part):
p_str = ""
digit_len = len(p)
if int(p) == 0 and i < len(part) - 1:
continue
hundreds_digit = int(p) // 100 if digit_len == 3 else None
tens_digit = int(p) % 100 if digit_len >= 2 else int(p[0] if digit_len == 1 else p[1])
# Process hundreds
if hundreds_digit is not None and hundreds_digit != 0:
p_str += english_digits[hundreds_digit] + " hundred"
if tens_digit != 0:
p_str += " and "
# Process tens and ones
if 10 < tens_digit < 20: # Teens exception
teen_map = {
@@ -79,17 +79,17 @@ def num_to_english(num):
p_str += tens_map[tens_val] + (" " + english_digits[ones_val] if ones_val != 0 else "")
elif tens_digit != 0 and tens_val < 2: # When tens_digit is in [1, 9]
p_str += english_digits[tens_digit]
if p_str and not is_first_part and need_and:
result += " and "
result += p_str
if i < len(part) - 1 and int(p) != 0:
result += " " + big_units[len(part) - i - 1] + ", "
is_first_part = False
if int(p) != 0:
need_and = True
return result.capitalize()
@@ -120,8 +120,8 @@ def num2text(text):
text = re.sub(r'((?:\d+\.)?\d+)\s*/\s*(\d+)', fraction_to_words, text)
# 取出数字 number_list= [('1000200030004000.123', '1000200030004000', '123'), ('23425', '23425', '')]
number_list=re.findall('((\d+)(?:\.(\d+))?%?)',text)
if len(number_list)>0:
number_list=re.findall(r'((\d+)(?:\.(\d+))?%?)', text)
if len(number_list)>0:
#dc= ('1000200030004000.123', '1000200030004000', '123','')
for m,dc in enumerate(number_list):
if len(dc[1])>16:
@@ -133,14 +133,14 @@ def num2text(text):
int_text=f' the pronunciation of {int_text}'
text=text.replace(dc[0],int_text)
return text.replace('1',' one ').replace('2',' two ').replace('3',' three ').replace('4',' four ').replace('5',' five ').replace('6',' six ').replace('7','seven').replace('8',' eight ').replace('9',' nine ').replace('0',' zero ').replace('=',' equals ')
# 中英文数字转换为文字,特殊符号处理
def split_text(text_list):
tx = TextNormalizer()
haserror=False
result=[]
@@ -177,12 +177,12 @@ def split_text_by_punctuation(text):
min_length = 150
punctuation_marks = "。?!,、;:”’》」』)】…—"
english_punctuation = ".?!,:;)}…"
# 结果列表
result = []
# 起始位置
pos = 0
# 遍历文本中的每个字符
text_length=len(text)
for i, char in enumerate(text):
@@ -196,11 +196,11 @@ def split_text_by_punctuation(text):
# 更新起始位置到当前标点的下一个字符
pos = i+1
#print(f'{pos=},{len(text)=}')
# 如果剩余文本长度超过120或没有更多标点符号可以进行分割将剩余的文本作为一个分段添加到结果列表
if pos < len(text):
result.append(text[pos:])
return result
@@ -223,17 +223,17 @@ def ClearWav(directory):
print(f"文件删除错误 {file_path}, 报错信息: {e}")
return False, str(e)
return True, "所有wav文件已被删除."
# 保存音色
# 保存音色
# 参考 https://github.com/craii/ChatTTS_WebUI/blob/main/utils.py
def save_speaker(name, tensor):
    """Write a speaker tensor to ``{SPEAKER_DIR}/{name}.csv``.

    Every element of ``tensor`` is converted with ``float`` and written as
    a single-column CSV with neither index nor header row. Any exception
    raised while building or writing the data is printed instead of being
    propagated, so the function always returns ``None``.
    """
    try:
        values = []
        for item in tensor:
            values.append(float(item))
        frame = pd.DataFrame({"speaker": values})
        target = f"{SPEAKER_DIR}/{name}.csv"
        frame.to_csv(target, index=False, header=False)
    except Exception as err:
        # Best-effort save: report the problem and carry on.
        print(err)
# 加载音色
# 参考 https://github.com/craii/ChatTTS_WebUI/blob/main/utils.py
def load_speaker(name):
@@ -311,4 +311,4 @@ def modelscope_status():
return False
except Exception as e:
return False
return True
return True

View File

@@ -58,7 +58,6 @@ def replace_temperature(match) -> str:
def replace_measure(sentence) -> str:
    """Replace unit notations in ``sentence`` using ``measure_dict``.

    For each key of ``measure_dict``: if the notation appears in the
    sentence immediately after a digit at least once, every occurrence of
    the notation in the sentence is replaced by its mapped value.

    Args:
        sentence: Text to process.

    Returns:
        The sentence with the matching notations replaced.
    """
    for q_notation in measure_dict:
        # The pattern must be a *raw f-string*: raw so that ``\d`` is not an
        # invalid string escape (SyntaxWarning), and an f-string so that the
        # notation is actually interpolated. A plain r'...' string would
        # search for the literal text "{q_notation}" and never match.
        # re.escape keeps any regex metacharacters in the notation literal,
        # consistent with the plain substring test on the left.
        if q_notation in sentence and re.search(
                rf'\d{re.escape(q_notation)}', sentence):
            sentence = sentence.replace(q_notation, measure_dict[q_notation])
    return sentence

View File

@@ -60,7 +60,7 @@ class TextNormalizer():
text (str): The input text.
Returns:
List[str]: Sentences.
character_map = {
"": "",
"": "",
@@ -105,8 +105,8 @@ class TextNormalizer():
return sentences
def _post_replace(self, sentence: str) -> str:
#sentence = sentence.replace('/', '每')
sentence = sentence.replace('~', '')
sentence = sentence.replace('', '')
@@ -146,8 +146,8 @@ class TextNormalizer():
sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
sentence = sentence.replace('+', '')
# re filter special characters, have one more character "-" than line 68
sentence = re.sub(r'[-——《》【】<=>{}()#&@“”^|…\\]', '', sentence)
return sentence
@@ -161,12 +161,12 @@ class TextNormalizer():
result = ""
zero_flag = False # 标记是否需要加'零'
part = [] # 存储每4位的数字
# 将数字按每4位分组
while num_str:
part.append(num_str[-4:])
num_str = num_str[:-4]
for i in range(len(part)):
part_str = ""
part_zero_flag = False
@@ -184,21 +184,21 @@ class TextNormalizer():
part_str = part_str[:-1] # 去除尾部的'零'
if part_str:
zero_flag = True
if i > 0 and not set(part[i]) <= {'0'}: # 如果当前部分不全是0则加上相应的大单位
result = part_str + big_units[i] + result
else:
result = part_str + result
# 处理输入为0的情况或者去掉开头的零
result = result.lstrip(chinese_digits[0])
if not result:
return chinese_digits[0]
return result
def normalize_sentence(self, sentence: str) -> str:
# basic character conversions
# add
sentence = re.sub(r'(\d+)\s*[\*xX]\s*(\d+)', r'\1 乘 \2', sentence,re.I)
@@ -207,12 +207,12 @@ class TextNormalizer():
sentence = re.sub(r'(0\d+)\-(\d{3,})', r'\1杠\2', sentence,re.I)
sentence = sentence.replace('=', '等于')
sentence = sentence.replace('÷','除以')
#sentence = re.sub(r'(\d+)\s*\-', r'\1 减', sentence)
sentence = re.sub(r'((?:\d+\.)?\d+)\s*/\s*(\d+)', r'\2分之\1', sentence)
# 取出数字 number_list= [('1000200030004000.123', '1000200030004000', '123'), ('23425', '23425', '')]
number_list=re.findall('((\d+)(?:\.(\d+))?%?)',sentence)
number_list=re.findall(r'((\d+)(?:\.(\d+))?%?)', sentence)
numtext=['','','','','','','','','','']
if len(number_list)>0:
#dc= ('1000200030004000.123', '1000200030004000', '123','')
@@ -227,8 +227,7 @@ class TextNormalizer():
if dc[0][-1]=='%':
int_text=f'百分之{int_text}'
sentence=sentence.replace(dc[0],int_text)
sentence = tranditional_to_simplified(sentence)
sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
F2H_DIGITS).translate(F2H_SPACE)
@@ -258,7 +257,7 @@ class TextNormalizer():
sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
sentence = RE_NUMBER.sub(replace_number, sentence)
sentence = self._post_replace(sentence)
sentence = sentence.replace('[一break]','[1break]')
return sentence