Add script to extract resource string, issue zufuliu#215, zufuliu#223.

RaiKoHoff · Jul 19, 2020 · df0bc7c · df0bc7c
1 parent f4951a1
commit df0bc7c
Show file tree

Hide file tree

Showing 2 changed files with 314 additions and 3 deletions.
diff --git a/locale/Locale.py b/locale/Locale.py
@@ -5,6 +5,7 @@
 import re
 import shutil
 import uuid
+import subprocess
 
 app = os.path.basename(__file__)
 localeDir = '.'
@@ -132,14 +133,318 @@ def copy_back_localized_resources(language):
     you can copy English resources back by run: {app} back en""")
 
 
+class StringExtractor:
+	def reset(self, path, reversion):
+		"""Prepare the extractor for one resource file.
+
+		Stores path and reversion on the instance and clears self.changed_lines.
+		When reversion is non-empty, find_changed_lines() is asked to fill
+		self.changed_lines first.
+
+		Returns False when a reversion was given but no changed line was found
+		(the file is not read in that case); otherwise returns the number of
+		lines read into self.lines, which is 0 for an empty file.
+		"""
+		self.reversion = reversion
+		self.path = path
+		self.changed_lines = set()
+		if reversion:
+			self.find_changed_lines(reversion)
+			if not self.changed_lines:
+				return False
+
+		# context manager closes the handle instead of leaving it to the garbage collector
+		with open(path, encoding='utf-8', newline='\n') as fd:
+			doc = fd.read()
+		self.lines = doc.splitlines()
+		return len(self.lines)
+
+	def find_changed_lines(self, reversion):
+		"""Record which lines of self.path differ from the given reversion.
+
+		Runs `git diff --unified=0` for self.path and adds the new-file line
+		numbers named by every hunk header to self.changed_lines. When the diff
+		is empty nothing else happens. Otherwise `git show` is run as well and
+		self.reversion becomes the reversion followed by the first two
+		whitespace-separated tokens of its output.
+
+		Whatever git prints on stderr is echoed to sys.stderr.
+		"""
+		diff_cmd = ['git', 'diff', '--no-color', '--unified=0', '--text', reversion, self.path]
+		proc = subprocess.run(diff_cmd, capture_output=True, encoding='utf-8')
+		if proc.stderr:
+			print(proc.stderr, file=sys.stderr)
+
+		diff = proc.stdout
+		if not diff:
+			return
+
+		# hunk header: @@ -old[,count] +new[,count] @@
+		hunk_pattern = r'^@@\s+\-\d+(,\d+)?\s+\+(\d+)(,\d+)?\s+@@'
+		for _, first, span in re.findall(hunk_pattern, diff, re.MULTILINE):
+			first = int(first)
+			# the count group still carries its leading comma
+			span = int(span[1:]) if span else 0
+			if span:
+				self.changed_lines.update(range(first, first + span))
+			else:
+				self.changed_lines.add(first)
+
+		# reversion time
+		show_cmd = ['git', 'show', '--no-patch', '--no-notes', "--pretty='%ci'", reversion]
+		proc = subprocess.run(show_cmd, capture_output=True, encoding='utf-8')
+		if proc.stderr:
+			print(proc.stderr, file=sys.stderr)
+		# the quotes inside the --pretty argument come back in the output, drop them
+		stamp = proc.stdout.replace("'", '').split()[:2]
+		self.reversion = f"{reversion} {' '.join(stamp)}".strip()
+
+	def is_line_changed(self, start, end):
+		"""Tell whether any line in the inclusive range start..end is marked as changed.
+
+		An empty self.changed_lines means no filtering is active, so every
+		range is reported as changed.
+		"""
+		changed = self.changed_lines
+		if not changed:
+			return True
+		return any(number in changed for number in range(start, end + 1))
+
+	def match_line(self, line, word):
+		"""Tell whether line begins with word used as a whole keyword.
+
+		The keyword must be followed by the end of the line, a whitespace
+		character or '/'.
+		"""
+		if not line.startswith(word):
+			return False
+		rest = line[len(word):]
+		if not rest:
+			return True
+		return rest[0].isspace() or rest[0] == '/'
+
+	def scan_string(self, line, escape_sequence, format_specifier, access_key, start):
+		"""Scan one double-quoted resource string, or one line of a multi-line string.
+
+		line: the text to scan.
+		escape_sequence: set that receives each backslash escape whose letter is
+			one of a, b, f, n, r, t, v, x (plus two characters) or u (plus four).
+		format_specifier: set that receives '%s' when it occurs.
+		access_key: list that receives the character following each single '&';
+			pass None to skip access key collection.
+		start: True when line must contain the opening quote; False when line
+			continues a string opened on an earlier line, scanned from column 0.
+
+		Returns (value, begin, index, stop): the scanned text including quotes,
+		its start offset, the offset just past the scanned text, and whether the
+		closing quote was found. When start is True and no quote is found after
+		column 0 the result is ('', 0, 0, False).
+		"""
+		index = 0
+		if start:
+			index = line.find('"')
+			if index <= 0:
+				return '', 0, 0, False
+
+		length = len(line)
+		begin = index
+		if start:
+			index += 1
+		stop = False
+
+		while index < length:
+			ch = line[index]
+			index += 1
+			# Look ahead safely: '' at end of line, so a trailing '&' or '%' on a
+			# continuation line no longer raises IndexError.
+			following = line[index] if index < length else ''
+			if ch == '\\':
+				end = index + 1
+				if following and following in 'abfnrtvxu':
+					if following == 'x':
+						end += 2
+					elif following == 'u':
+						end += 4
+					escape_sequence.add(line[index - 1:end])
+				# always step over the escaped character, so \" does not end the string
+				index = end
+			elif ch == '&':
+				if following == '&':
+					# '&&' is a literal ampersand; consume both so the character
+					# after the pair is not mistaken for an access key
+					index += 1
+				elif access_key is not None and following and not following.isspace():
+					access_key.append(following)
+			elif ch == '%':
+				if following == '%':
+					# '%%' is a literal percent sign, so '%%s' is not a format specifier
+					index += 1
+				elif following == 's':
+					# we only use '%s' in resource files
+					format_specifier.add('%' + following)
+			elif ch == '"':
+				if following == ch:
+					# doubled quote stands for a quote character inside the string
+					index += 1
+				else:
+					stop = True
+					break
+
+		value = line[begin:index]
+		return value, begin, index, stop
+
+	def build_hint(self, escape_sequence, format_specifier, access_key):
+		"""Build the translator hint appended to an item comment.
+
+		Each non-empty argument contributes one ', label: values' fragment, in
+		the order access key, escape sequence, format specifier. Escape
+		sequences and format specifiers are listed sorted; the access key is
+		upper-cased. Returns '' when all three are empty.
+
+		Asserts that at most one access key was collected.
+		"""
+		fragments = []
+		if access_key:
+			assert len(access_key) == 1
+			fragments.append(', access key: ' + access_key[0].upper())
+		if escape_sequence:
+			fragments.append(', escape sequence: ' + ', '.join(sorted(escape_sequence)))
+		if format_specifier:
+			fragments.append(', format specifier: ' + ', '.join(sorted(format_specifier)))
+		return ''.join(fragments)
+
+	def parse_resource_item(self, lineno, line, block_items):
+		"""Parse one MENU / DIALOGEX statement of the form: type "string", id.
+
+		Appends a {'value', 'comment'} dict to block_items for a translatable
+		string. Nothing is appended when the line is not marked as changed, the
+		string has no alphabetic character, or the text before the quote is not
+		an identifier.
+
+		Returns the text 'CAPTION "..."' for a CAPTION statement (nothing is
+		appended then) and None in every other case.
+		"""
+		if not self.is_line_changed(lineno, lineno):
+			return None
+
+		escapes, formats, keys = set(), set(), []
+		value, quote_pos, after, _ = self.scan_string(line, escapes, formats, keys, True)
+		has_letter = any(ch.isalpha() for ch in value)
+		if not has_letter:
+			return None
+
+		# type "string", id
+		word = line[:quote_pos].strip()
+		if not word.isidentifier():
+			return None
+
+		rcid = ''
+		comma = line.find(',', after)
+		if comma > 0:
+			# the id sits between the first and the second comma after the string
+			next_comma = line.find(',', comma + 1)
+			tail = line[comma + 1:next_comma] if next_comma > 0 else line[comma + 1:]
+			rcid = tail.strip()
+			assert rcid.isidentifier()
+
+		if word == 'CAPTION':
+			return f'{word} {value}'
+
+		hint = self.build_hint(escapes, formats, keys)
+		block_items.append({
+			'value': value,
+			'comment': f'// {lineno} {word} {rcid}'.strip() + hint
+		})
+		return None
+
+	def parse_string_table_item(self, lineno, line, block_items):
+		"""Parse one STRINGTABLE entry of the form: id "string", possibly spanning lines.
+
+		lineno is the index in self.lines of the line following `line`. While
+		the closing quote has not been seen, further lines are taken from
+		self.lines and joined with newlines.
+
+		Appends a {'value', 'comment'} dict to block_items when the entry is
+		marked as changed and contains an alphabetic character.
+
+		Returns the index of the first line not consumed by this entry.
+		Asserts that the text before the opening quote is an identifier.
+		"""
+		# id "multiline string"
+		escapes, formats, keys = set(), set(), []
+		chunk, quote_pos, _, closed = self.scan_string(line, escapes, formats, keys, True)
+
+		rcid = line[:quote_pos].strip()
+		assert rcid.isidentifier()
+
+		first_line = lineno
+		chunks = [chunk]
+		# keep consuming following lines until the closing quote is seen
+		while not closed:
+			continuation = self.lines[lineno]
+			lineno += 1
+			chunk, _, _, closed = self.scan_string(continuation, escapes, formats, keys, False)
+			chunks.append(chunk)
+
+		if not self.is_line_changed(first_line, lineno):
+			return lineno
+
+		value = '\n'.join(chunks)
+		has_letter = any(ch.isalpha() for ch in value)
+		if not has_letter:
+			return lineno
+
+		hint = self.build_hint(escapes, formats, keys)
+		block_items.append({
+			'value': value,
+			'comment': f'// {first_line} {rcid}' + hint
+		})
+		return lineno
+
+	def extract(self, path, reversion, out_path=None):
+		"""Extract the strings of one resource script and write them via save().
+
+		path: resource file to read.
+		reversion: when non-empty, only lines changed since this reversion are
+			extracted (see reset() and is_line_changed()).
+		out_path: forwarded to save(); a falsy value lets save() pick the name.
+
+		Does nothing when reset() returns a falsy value, and writes no file when
+		no string was collected. A parse failure is reported with print() and
+		the exception is re-raised.
+		"""
+		if not self.reset(path, reversion):
+			return
+
+		# kinds of block the scanner can currently be inside
+		Block_None = 0
+		Block_Menu = 1
+		Block_DialogEx = 2
+		Block_StringTable = 3
+		Block_Ignore = 4
+
+		block_type = Block_None
+		block_name = ''
+		block_caption = ''
+		# BEGIN/END nesting depth inside the current block
+		begin = 0
+		block_begin = 0
+		block_items = []
+
+		lineno = 0
+		line_count = len(self.lines)
+		string_list = []
+
+		while lineno < line_count:
+			line = self.lines[lineno]
+			line = line.strip()
+			# after this increment lineno is the 1-based number of `line`
+			# and also the index of the next line in self.lines
+			lineno += 1
+			# skip blank lines, // comments and lines starting with '#'
+			if not line or line.startswith('//') or line.startswith('#'):
+				continue
+
+			if block_type == Block_None:
+				# outside any block: look for a block header
+				begin = 0
+				if self.match_line(line, 'STRINGTABLE'):
+					block_type = Block_StringTable
+					block_name = 'STRINGTABLE'
+				else:
+					# other headers are recognized by their second word
+					items = line.split()
+					if len(items) >= 2:
+						if items[1] == 'MENU':
+							block_type = Block_Menu
+							block_name = ' '.join(items[:2])
+						elif items[1] == 'DIALOGEX':
+							block_type = Block_DialogEx
+							block_name = ' '.join(items[:2])
+						elif items[1] in ('ACCELERATORS', 'DESIGNINFO', 'TEXTINCLUDE'):
+							block_type = Block_Ignore
+				if block_type != Block_None:
+					# a block was entered on this line: start a fresh item list
+					block_begin = lineno
+					block_items = []
+					block_caption = ''
+			elif self.match_line(line, 'BEGIN'):
+				begin += 1
+			elif self.match_line(line, 'END'):
+				begin -= 1
+				if begin <= 0:
+					# outermost END: the block is complete
+					block_type = Block_None
+					if block_items:
+						string_list.append({
+							'name': block_name,
+							'comment': f'// line {block_begin} - {lineno}',
+							'caption': block_caption,
+							'items': block_items
+						})
+						block_items = []
+			elif block_type != Block_Ignore:
+				try:
+					if block_type == Block_Menu or block_type == Block_DialogEx:
+						# a truthy return value is the CAPTION statement of the block
+						caption = self.parse_resource_item(lineno, line, block_items)
+						if caption:
+							block_caption = caption
+					elif block_type == Block_StringTable:
+						# a string table entry may span lines; continue after the lines it consumed
+						lineno = self.parse_string_table_item(lineno, line, block_items)
+				except Exception as ex:
+					print(f'parse {block_type} {block_name} fail at {lineno} for {self.path}')
+					raise
+
+		if string_list:
+			self.save(string_list, out_path)
+
+	def save(self, string_list, out_path=None):
+		"""Write the extracted blocks to a text file.
+
+		string_list: list of dicts with 'comment', 'name', 'items' and an
+			optional 'caption'; each item is a dict with 'comment' and 'value'.
+		out_path: destination file; when falsy, '-string' is inserted before
+			the extension of self.path.
+
+		The file starts with two '//!' instruction lines and, when
+		self.reversion is set, a line naming that reversion. Each block is
+		written as comment, name, optional caption, then its items between
+		BEGIN and END. The destination path is printed before writing.
+		"""
+		if not out_path:
+			stem, ext = os.path.splitext(self.path)
+			out_path = stem + '-string' + ext
+
+		print('save:', out_path)
+		with open(out_path, 'w', encoding='utf-8') as fd:
+			fd.write("//! Ignore line starts with //, it's a comment line.\n")
+			fd.write("//! Please don't translate escape sequence or format specifiers.\n")
+			fd.write('\n')
+			if self.reversion:
+				fd.write("//! Updated strings since: " + self.reversion + '\n')
+			for block in string_list:
+				fd.write(block['comment'] + '\n')
+				fd.write(block['name'] + '\n')
+				caption = block.get('caption')
+				if caption:
+					fd.write(caption + '\n')
+				fd.write('BEGIN\n')
+				for item in block['items']:
+					fd.write('\t' + item['comment'] + '\n')
+					fd.write('\t' + item['value'] + '\n')
+				fd.write('END\n\n')
+
+def extract_resource_string(language, reversion):
+	"""Extract resource strings for one language.
+
+	For 'en' the two English source resource files are processed; for any
+	other language the metapath.rc and Notepad2.rc files inside that
+	language's folder under localeDir. reversion is passed through to
+	StringExtractor.extract() for each file.
+	"""
+	print(f'{app}: extract updated string for {language} since {reversion}.')
+
+	if language == 'en':
+		sources = (metapath_src, notepad2_src)
+	else:
+		folder = os.path.join(localeDir, language)
+		sources = tuple(os.path.join(folder, name) for name in ('metapath.rc', 'Notepad2.rc'))
+
+	extractor = StringExtractor()
+	for source in sources:
+		extractor.extract(source, reversion)
+
+
 def show_help():
-	print(f"""Usage: {app} action language
+	print(f"""Usage: {app} action language [reversion]
 action:
     new     create a new localization for specific language.
 
     back    prepare building standalone localized program for specific language,
             copy back localized resources to overwrite English resources.
-            English resources are copied into en folder when the folder does not exist.""")
+            English resources are copied into en folder when the folder does not exist.
+
+    string  extract all resource string or updated strings since specific reversion.
+""")
 
 def main():
 	if len(sys.argv) < 3:
@@ -159,6 +464,12 @@ def main():
 			print(f'{app}: language {language} not localized [{", ".join(availableLocales)}].');
 			return
 		copy_back_localized_resources(language)
+	elif action == 'string':
+		if language != 'en' and language not in availableLocales:
+			print(f'{app}: language {language} not localized [{", ".join(availableLocales)}].');
+			return
+		reversion = sys.argv[3] if len(sys.argv) > 3 else ''
+		extract_resource_string(language, reversion)
 	else:
 		show_help()
 

diff --git a/wiki b/wiki