tools/scripts/coll2icu.py - platform/external/cldr - Git at Google

 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 #
 # created on: 2013jun05
 # created by: Markus W. Scherer

 """Converts CLDR collation files from XML syntax to ICU syntax.

 Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
 Preserves indentation (except where it joins lines) and text vs. NCR etc.
 Does not handle arbitrary LDML XML collation syntax."""

 # Invoke with two arguments:
 # - the source folder path
 # - the destination folder path
 # For example:
 # ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation

 import codecs
 import glob
 import os.path
 import sys

 def GetIndent(s):
   for i in range(len(s)):
     if s[i] not in " \t": return s[:i]
   return s


 # substring replacements
 replacements = (
   # White space and syntax characters must be quoted.
   # Using '\\u0020' rather than just ' ' for clarity.
   ("<reset> </reset>", "&'\\u0020'"),  # can't just replace all "> <"
   (">!<", ">'!'<"),
   ('>"<', ">'\\\"'<"),
   (">&quot;<", ">'\\\"'<"),
   (">#<", ">'\\u0023'<"),
   (">$<", ">'$'<"),
   (">%<", ">'%'<"),
   (">&<", ">'&'<"),
   (">&amp;<", ">'&'<"),
   (">'<", ">''<"),
   (">&apos;<", ">''<"),
   (">(<", ">'('<"),
   (">)<", ">')'<"),
   (">*<", ">'*'<"),
   (">+<", ">'+'<"),
   (">,<", ">','<"),
   (">-<", ">'-'<"),
   (">.<", ">'.'<"),
   (">/<", ">'/'<"),
   (">:<", ">':'<"),
   (">;<", ">';'<"),
   (">&lt;<", ">'<'<"),
   (">=<", ">'='<"),
   (">&gt;<", ">'>'<"),
   (">?<", ">'?'<"),
   (">@<", ">'@'<"),
   (">[<", ">'['<"),
   (">\\<", ">'\\\\'<"),
   (">]<", ">']'<"),
   (">^<", ">'^'<"),
   (">_<", ">'_'<"),
   (">`<", ">'`'<"),
   (">{<", ">'{'<"),
   (">|<", ">'|'<"),
   (">}<", ">'}'<"),
   (">~<", ">'~'<"),
   # ha.xml has the following
   ("'y", "''y"),
   ("'Y", "''Y"),
   # kl.xml has the following
   ("K'", "K''"),
   # not Pattern_White_Space, just obscure
   (u"\u00A0", u"\\u00A0"),
   (u"\u200C", u"\\u200C"),
   (u"\u200D", u"\\u200D"),
   (u"\u3000", u"\\u3000"),
   # obscure, and some tools do not handle noncharacters well
   (u"\uFDD0", u"'\\uFDD0'"),
   # The old ICU collation rule parser seems to need more escaping than it should.
   (u"≠", u"'≠'"),
   # fi.xml resets contain a space
   (u" ̵</reset>", u"'\\u0020'̵"),
   # fa.xml <sc> with non-NFD_Inert chars
   (u"<sc>\u0650\u064f\u064b\u064d\u064c</sc>", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
   # ml.xml strings contain spaces
   (u" </s>", u"'\\u0020'"),
   (u" </reset>", u"'\\u0020'"),
   # vi.xml <sc> with non-NFD_Inert chars
   (u"<sc>\u0309\u0303\u0301\u0323</sc>", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
   # en_US_POSIX needs a lot of quoting.
   ("<pc>&#x20;&#x21;&#x22;&#x23;&#x24;&#x25;&#x26;&#x27;&#x28;&#x29;&#x2a;&#x2b;&#x2c;&#x2d;&#x2e;&#x2f;</pc>", "<*'\\u0020'-'/'"),
   ("<pc>0123456789&#x3a;&#x3b;&#x3c;&#x3d;&#x3e;&#x3f;&#x40;</pc>", "<*0-'@'"),
   ("<pc>&#x5b;&#x5c;&#x5d;&#x5e;&#x5f;&#x60;</pc>", "<*'['-'`'"),
   ("<pc>&#x7b;&#x7c;&#x7d;&#x7e;&#x7f;</pc>", "<*'{'-'\u007F'"),
   # CJK parenthesized resets
   ("<reset>(", "&'('"),
   (")</reset>", "')'"),
   # Convert XML elements into ICU syntax.
   ("><!--", "> #"),  # add a space before an inline comment
   ("<!--", "#"),
   (" -->", ""),
   ("-->", ""),
   ("<reset>", "&"),
   ('<reset before="primary">', "&[before 1]"),
   ('<reset before="secondary">', "&[before 2]"),
   ('<reset before="tertiary">', "&[before 3]"),
   ("</reset>", ""),
   ("<p>", "<"),
   ("</p>", ""),
   ("<s>", "<<"),
   ("</s>", ""),
   ("<t>", "<<<"),
   ("</t>", ""),
   ("<i>", "="),
   ("</i>", ""),
   ("<pc>", "<*"),
   ("</pc>", ""),
   ("<sc>", "<<*"),
   ("</sc>", ""),
   ("<tc>", "<<<*"),
   ("</tc>", ""),
   ("<ic>", "=*"),
   ("</ic>", ""),
   ("<x>", ""),
   ("</x>", ""),
   ("<extend>", "/"),
   ("</extend>", ""),
   ("</context>", "|"),
   ("<first_tertiary_ignorable/>", "[first tertiary ignorable]"),
   ("<last_tertiary_ignorable/>", "[last tertiary ignorable]"),
   ("<first_secondary_ignorable/>", "[first secondary ignorable]"),
   ("<last_secondary_ignorable/>", "[last secondary ignorable]"),
   ("<first_primary_ignorable/>", "[first primary ignorable]"),
   ("<last_primary_ignorable/>", "[last primary ignorable]"),
   ("<first_variable/>", "[first variable]"),
   ("<last_variable/>", "[last variable]"),
   ("<first_non_ignorable/>", "[first regular]"),
   ("<last_non_ignorable/>", "[last regular]"),
   ("<last_non_ignorable />", "[last regular]"),
   ("<first_trailing/>", "[first trailing]"),
   ("<last_trailing/>", "[last trailing]")
 )


 def ConvertFile(src, dest):
   in_rules = False
   partial = ""
   in_ml_comment = False
   for line in src:
     if "<rules>" in line:
       indent = GetIndent(line)
       stripped = line.strip()
       # Replace import-only rules with import elements.
       if stripped == '<rules><import source="sr"/></rules>':
         dest.write(indent + '<import source="sr"/>\n')
       elif stripped == '<rules><import source="hr" type="search"/></rules>':
         dest.write(indent + '<import source="hr" type="search"/>\n')
       elif stripped == '<rules><import source="hr"/></rules>':
         dest.write(indent + '<import source="hr"/>\n')
       elif stripped == '<rules><import source="ps"/></rules>':
         dest.write(indent + '<import source="ps"/>\n')
       else:
         # Replace the XML <rules> section with ICU syntax rules in <cr>.
         assert stripped == "<rules>"
         dest.write(indent + "<cr><![CDATA[\n")
         in_rules = True
     elif "</rules>" in line:
       # Flush, and go back to just copying lines until the next <rules>.
       if partial:
         dest.write(partial + "\n")
         partial = ""
       in_ml_comment = False
       dest.write(GetIndent(line) + "]]></cr>\n")
       in_rules = False
     else:
       if in_rules:
         # Find out whether we want to concatenate the current line
         # with the previous and/or next one.
         finish_partial = False  # Finish collected, partial input.
         start_ml_comment = False  # Start of a multi-line comment.
         stop_comment = False  # End of a comment, must terminate the line.
         if ("<reset" in line) or line.lstrip().startswith("<!--"):
           finish_partial = True
         if partial and len(partial.strip()) > 80:
           finish_partial = True
         if "<!--" in line and "-->" not in line:
           start_ml_comment = True
         if "-->" in line:
           assert line.rstrip().endswith("-->")
           stop_comment = True

         # Convert XML syntax to ICU syntax.
         if "<context>" in line:
           # Swap context & relation:
           #   <x><context>カ</context><i>ー</i></x>
           # turns into
           #   =カ|ー
           if "<i>" in line:
             line = line.replace("<i>", "").replace("<context>", "<i>")
           elif "<t>" in line:
             line = line.replace("<t>", "").replace("<context>", "<t>")

         for (xml, icu) in replacements:
           line = line.replace(xml, icu)

         while True:
           # Convert a Numeric Character Reference to \\uhhhh.
           i = line.find("&#x")
           if i < 0: break
           limit = line.find(";", i + 3)
           cp = line[i + 3:limit]
           while len(cp) < 4: cp = "0" + cp
           assert len(cp) == 4  # not handling supplementary code points
           line = line[:i] + "\\u" + cp + line[limit + 1:]

         # Start/continue/finish concatenation, and output.
         if partial and finish_partial:
           # Write collected input.
           dest.write(partial + "\n")
           partial = ""

         if start_ml_comment:
           # Start a multi-line comment.
           assert not partial
           comment_indent = GetIndent(line)  # can be the empty string
           in_ml_comment = True
         elif in_ml_comment:
           # Continue a multi-line comment.
           assert not partial
           if line.startswith(comment_indent):
             if line[len(comment_indent)] in " \t":
               # Preserve further indentation.
               line = comment_indent + "#" + line[len(comment_indent):]
             else:
               # Add a space after the #.
               line = comment_indent + "# " + line[len(comment_indent):]
           else:
             # Indent at least as much as the first line.
             line = line.lstrip()
             if line:
               line = comment_indent + "# " + line
             else:
               line = comment_indent + "#\n"
         elif stop_comment:
           # Just output the line, do not start collecting input.
           # ICU-syntax comments end with the end of the line,
           # do not append rules to them.
           if partial:
             line = partial + line.lstrip() + "\n"
             partial = ""
         elif not partial:
           # Start collecting input.
           partial = line.rstrip()
         elif partial:
           # Continue collecting input.
           partial += line.strip()

         if stop_comment:
           in_ml_comment = False
       if not partial: dest.write(line)


 def main():
   (src_root, dest_root) = sys.argv[1:3]
   src_pattern = os.path.join(src_root, "*.xml")
   for src_path in glob.iglob(src_pattern):
     basename = os.path.basename(src_path)
     dest_path = os.path.join(dest_root, basename)
     with codecs.open(src_path, "r", "UTF-8") as src:
       with codecs.open(dest_path, "w", "UTF-8") as dest:
         ConvertFile(src, dest)


 if __name__ == "__main__":
   main()
	#!/usr/bin/python
	# -- coding: utf-8 --
	#
	# created on: 2013jun05
	# created by: Markus W. Scherer

	"""Converts CLDR collation files from XML syntax to ICU syntax.

	Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
	Preserves indentation (except where it joins lines) and text vs. NCR etc.
	Does not handle arbitrary LDML XML collation syntax."""

	# Invoke with two arguments:
	# - the source folder path
	# - the destination folder path
	# For example:
	# ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation

	import codecs
	import glob
	import os.path
	import sys

	def GetIndent(s):
	for i in range(len(s)):
	if s[i] not in " \t": return s[:i]
	return s


	# substring replacements
	replacements = (
	# White space and syntax characters must be quoted.
	# Using '\\u0020' rather than just ' ' for clarity.
	("<reset> </reset>", "&'\\u0020'"), # can't just replace all "> <"
	(">!<", ">'!'<"),
	('>"<', ">'\\\"'<"),
	(">"<", ">'\\\"'<"),
	(">#<", ">'\\u0023'<"),
	(">$<", ">'$'<"),
	(">%<", ">'%'<"),
	(">&<", ">'&'<"),
	(">&<", ">'&'<"),
	(">'<", ">''<"),
	(">'<", ">''<"),
	(">(<", ">'('<"),
	(">)<", ">')'<"),
	("><", ">''<"),
	(">+<", ">'+'<"),
	(">,<", ">','<"),
	(">-<", ">'-'<"),
	(">.<", ">'.'<"),
	(">/<", ">'/'<"),
	(">:<", ">':'<"),
	(">;<", ">';'<"),
	("><<", ">'<'<"),
	(">=<", ">'='<"),
	(">><", ">'>'<"),
	(">?<", ">'?'<"),
	(">@<", ">'@'<"),
	(">[<", ">'['<"),
	(">\\<", ">'\\\\'<"),
	(">]<", ">']'<"),
	(">^<", ">'^'<"),
	(">_<", ">'_'<"),
	(">`<", ">'`'<"),
	(">{<", ">'{'<"),
	(">\|<", ">'\|'<"),
	(">}<", ">'}'<"),
	(">~<", ">'~'<"),
	# ha.xml has the following
	("'y", "''y"),
	("'Y", "''Y"),
	# kl.xml has the following
	("K'", "K''"),
	# not Pattern_White_Space, just obscure
	(u"\u00A0", u"\\u00A0"),
	(u"\u200C", u"\\u200C"),
	(u"\u200D", u"\\u200D"),
	(u"\u3000", u"\\u3000"),
	# obscure, and some tools do not handle noncharacters well
	(u"\uFDD0", u"'\\uFDD0'"),
	# The old ICU collation rule parser seems to need more escaping than it should.
	(u"≠", u"'≠'"),
	# fi.xml resets contain a space
	(u" ̵</reset>", u"'\\u0020'̵"),
	# fa.xml <sc> with non-NFD_Inert chars
	(u"<sc>\u0650\u064f\u064b\u064d\u064c</sc>", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
	# ml.xml strings contain spaces
	(u" </s>", u"'\\u0020'"),
	(u" </reset>", u"'\\u0020'"),
	# vi.xml <sc> with non-NFD_Inert chars
	(u"<sc>\u0309\u0303\u0301\u0323</sc>", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
	# en_US_POSIX needs a lot of quoting.
	("<pc> !"#$%&'()+,-./</pc>", "<'\\u0020'-'/'"),
	("<pc>0123456789:;<=>?@</pc>", "<*0-'@'"),
	("<pc>[\]^_`</pc>", "<*'['-'`'"),
	("<pc>{\|}~</pc>", "<*'{'-'\u007F'"),
	# CJK parenthesized resets
	("<reset>(", "&'('"),
	(")</reset>", "')'"),
	# Convert XML elements into ICU syntax.
	("><!--", "> #"), # add a space before an inline comment
	("<!--", "#"),
	(" -->", ""),
	("-->", ""),
	("<reset>", "&"),
	('<reset before="primary">', "&[before 1]"),
	('<reset before="secondary">', "&[before 2]"),
	('<reset before="tertiary">', "&[before 3]"),
	("</reset>", ""),
	("<p>", "<"),
	("</p>", ""),
	("<s>", "<<"),
	("</s>", ""),
	("<t>", "<<<"),
	("</t>", ""),
	("<i>", "="),
	("</i>", ""),
	("<pc>", "<*"),
	("</pc>", ""),
	("<sc>", "<<*"),
	("</sc>", ""),
	("<tc>", "<<<*"),
	("</tc>", ""),
	("<ic>", "=*"),
	("</ic>", ""),
	("<x>", ""),
	("</x>", ""),
	("<extend>", "/"),
	("</extend>", ""),
	("</context>", "\|"),
	("<first_tertiary_ignorable/>", "[first tertiary ignorable]"),
	("<last_tertiary_ignorable/>", "[last tertiary ignorable]"),
	("<first_secondary_ignorable/>", "[first secondary ignorable]"),
	("<last_secondary_ignorable/>", "[last secondary ignorable]"),
	("<first_primary_ignorable/>", "[first primary ignorable]"),
	("<last_primary_ignorable/>", "[last primary ignorable]"),
	("<first_variable/>", "[first variable]"),
	("<last_variable/>", "[last variable]"),
	("<first_non_ignorable/>", "[first regular]"),
	("<last_non_ignorable/>", "[last regular]"),
	("<last_non_ignorable />", "[last regular]"),
	("<first_trailing/>", "[first trailing]"),
	("<last_trailing/>", "[last trailing]")
	)


	def ConvertFile(src, dest):
	in_rules = False
	partial = ""
	in_ml_comment = False
	for line in src:
	if "<rules>" in line:
	indent = GetIndent(line)
	stripped = line.strip()
	# Replace import-only rules with import elements.
	if stripped == '<rules><import source="sr"/></rules>':
	dest.write(indent + '<import source="sr"/>\n')
	elif stripped == '<rules><import source="hr" type="search"/></rules>':
	dest.write(indent + '<import source="hr" type="search"/>\n')
	elif stripped == '<rules><import source="hr"/></rules>':
	dest.write(indent + '<import source="hr"/>\n')
	elif stripped == '<rules><import source="ps"/></rules>':
	dest.write(indent + '<import source="ps"/>\n')
	else:
	# Replace the XML <rules> section with ICU syntax rules in <cr>.
	assert stripped == "<rules>"
	dest.write(indent + "<cr><![CDATA[\n")
	in_rules = True
	elif "</rules>" in line:
	# Flush, and go back to just copying lines until the next <rules>.
	if partial:
	dest.write(partial + "\n")
	partial = ""
	in_ml_comment = False
	dest.write(GetIndent(line) + "]]></cr>\n")
	in_rules = False
	else:
	if in_rules:
	# Find out whether we want to concatenate the current line
	# with the previous and/or next one.
	finish_partial = False # Finish collected, partial input.
	start_ml_comment = False # Start of a multi-line comment.
	stop_comment = False # End of a comment, must terminate the line.
	if ("<reset" in line) or line.lstrip().startswith("<!--"):
	finish_partial = True
	if partial and len(partial.strip()) > 80:
	finish_partial = True
	if "<!--" in line and "-->" not in line:
	start_ml_comment = True
	if "-->" in line:
	assert line.rstrip().endswith("-->")
	stop_comment = True

	# Convert XML syntax to ICU syntax.
	if "<context>" in line:
	# Swap context & relation:
	# <x><context>カ</context><i>ー</i></x>
	# turns into
	# =カ\|ー
	if "<i>" in line:
	line = line.replace("<i>", "").replace("<context>", "<i>")
	elif "<t>" in line:
	line = line.replace("<t>", "").replace("<context>", "<t>")

	for (xml, icu) in replacements:
	line = line.replace(xml, icu)

	while True:
	# Convert a Numeric Character Reference to \\uhhhh.
	i = line.find("&#x")
	if i < 0: break
	limit = line.find(";", i + 3)
	cp = line[i + 3:limit]
	while len(cp) < 4: cp = "0" + cp
	assert len(cp) == 4 # not handling supplementary code points
	line = line[:i] + "\\u" + cp + line[limit + 1:]

	# Start/continue/finish concatenation, and output.
	if partial and finish_partial:
	# Write collected input.
	dest.write(partial + "\n")
	partial = ""

	if start_ml_comment:
	# Start a multi-line comment.
	assert not partial
	comment_indent = GetIndent(line) # can be the empty string
	in_ml_comment = True
	elif in_ml_comment:
	# Continue a multi-line comment.
	assert not partial
	if line.startswith(comment_indent):
	if line[len(comment_indent)] in " \t":
	# Preserve further indentation.
	line = comment_indent + "#" + line[len(comment_indent):]
	else:
	# Add a space after the #.
	line = comment_indent + "# " + line[len(comment_indent):]
	else:
	# Indent at least as much as the first line.
	line = line.lstrip()
	if line:
	line = comment_indent + "# " + line
	else:
	line = comment_indent + "#\n"
	elif stop_comment:
	# Just output the line, do not start collecting input.
	# ICU-syntax comments end with the end of the line,
	# do not append rules to them.
	if partial:
	line = partial + line.lstrip() + "\n"
	partial = ""
	elif not partial:
	# Start collecting input.
	partial = line.rstrip()
	elif partial:
	# Continue collecting input.
	partial += line.strip()

	if stop_comment:
	in_ml_comment = False
	if not partial: dest.write(line)


	def main():
	(src_root, dest_root) = sys.argv[1:3]
	src_pattern = os.path.join(src_root, "*.xml")
	for src_path in glob.iglob(src_pattern):
	basename = os.path.basename(src_path)
	dest_path = os.path.join(dest_root, basename)
	with codecs.open(src_path, "r", "UTF-8") as src:
	with codecs.open(dest_path, "w", "UTF-8") as dest:
	ConvertFile(src, dest)


	if __name__ == "__main__":
	main()