# tools/python/src/cldr/refmirror.py - platform/external/cldr - Git at Google

 # (c) 2007-2010 IBM Corporation and Others. All Rights Reserved.
 # Python module for scanning and mirroring CLDR references.
 #
 # Steven R. Loomis. Oct 30th, 2007
 #
 #
 # usage:  refmirror.py  /path/to/cldr/common   /path/to/nonexistent/refmirror-output-dir
 #
 # note:
 #  - does condense duplicate URLs within a locale, to only download once
 #      (should condense globally.)
 #  - requires 'wget' installed.
 #
 # todo:
 #  - only handles <references> formats - so CLDR 1.5 main/ but NOT collation/
 #  - doesn't escape UTF-8 URLs such as wikipedia ( writes out url in utf-8, does not %-encode )
 #  - should probably pass "-t 2" (--tries) and/or "-T <seconds>" (--timeout) to wget to shorten hang time
 #

 from xml.dom import minidom
 import sys
 import os
 import codecs
 import subprocess

 # Flat script: walks the CLDR tree given as the first argument and, for every
 # <reference> element that has a 'type' and an http(s) 'uri', writes the
 # element to <output>/<dir>/<file.xml>/<type>.xml and mirrors the referenced
 # page with wget into <output>/<dir>/<file.xml>/<type>/.
 # Uses call-style print/raise and 'except ... as', so it runs under Python 3.

 progname = sys.argv[0]

 if len(sys.argv) != 3:
     raise RuntimeError("Usage: %s  <cldrroot> <output dir>" % (sys.argv[0]))

 cldrdir = sys.argv[1]
 htmldir = sys.argv[2]

 # os.mkdir raises if the directory already exists; the output dir must be new.
 print("# creating %s (shouldn't exist)" % htmldir)
 os.mkdir(htmldir)

 print("# walking %s" % cldrdir)

 for name, subdirs, files in os.walk(cldrdir):
     if name.endswith("/CVS"):
         continue
     # Prune CVS bookkeeping directories in place so os.walk never descends
     # into them; their output parent directory is never created, so anything
     # below them would make os.mkdir fail.
     subdirs[:] = [d for d in subdirs if d != "CVS"]

     # Path of this directory relative to cldrdir, without a leading slash.
     leaf = name[len(cldrdir):]
     if leaf.startswith('/'):
         leaf = leaf[1:]
     print("dir: %s" % str(leaf))
     out = "%s/%s" % (htmldir, leaf)
     if leaf:
         os.mkdir(out)

     for filename in files:
         if not filename.endswith('.xml'):
             continue

         # Maps each URI already fetched for this XML file to the reference
         # type that fetched it, so a repeated URI is downloaded only once.
         alreadyread = {}

         xmldir = "%s/%s" % (out, filename)

         # read file
         filepath = "%s/%s" % (name, filename)
         dom = minidom.parse(filepath)

         # documentElement is the root element whether or not a DOCTYPE or
         # comment precedes it, so no positional childNodes index is needed.
         root = dom.documentElement
         if root is None:
             continue
         refNode = root.getElementsByTagName('references')
         if not refNode:
             continue
         print("## got refnode %s" % filepath)
         os.mkdir(xmldir)

         # Only the first <references> element is scanned.
         for ref in refNode[0].getElementsByTagName('reference'):
             if not ref.hasAttribute('type'):
                 print("## untyped reference in %s" % filepath)
                 continue

             reftype = ref.getAttribute('type')
             if ref.hasAttribute('alt'):
                 reftype = "%s-%s" % (reftype, ref.getAttribute('alt'))
             typedir = "%s/%s" % (xmldir, reftype)

             if not ref.hasAttribute('uri'):
                 continue
             uri = ref.getAttribute('uri')
             if uri.startswith('urn:'):
                 uri = uri[len('urn:'):]
             if uri.startswith(('isbn', 'ISBN')):
                 # assume ISBN can fend for itself
                 continue
             if not uri.startswith('http'):
                 print("# Not a known scheme: %s on %s / %s" % (uri, filename, reftype))
                 continue

             # Write the info file: the <reference> element itself, UTF-8 with
             # a BOM. Binary mode because both writes are byte strings.
             with open("%s.xml" % typedir, 'wb') as infofile:
                 infofile.write(codecs.BOM_UTF8)
                 infofile.write(ref.toxml().encode("utf-8"))
             # Directory that receives the mirrored page (or duplicate.txt).
             os.mkdir(typedir)

             if uri in alreadyread:
                 # Already downloaded for this file: just record which
                 # reference type holds the copy.
                 with open("%s/duplicate.txt" % typedir, 'wb') as alfile:
                     alfile.write(("%s\n" % alreadyread[uri]).encode("utf-8"))
                 continue

             alreadyread[uri] = reftype
             # Argument list, not a shell string: quotes or other shell
             # metacharacters in the URI cannot break or alter the command.
             # -P: download directory, -nd: no directory hierarchy,
             # -np: never ascend to the parent, -k: convert links,
             # -p: also fetch page requisites.
             cmd = ["wget", "-P", typedir, "-nd", "-np", "-k", "-p", uri]
             print(" ".join(cmd))
             try:
                 # wget logs to stderr; send stdout and stderr to <type>.err.
                 with open("%s.err" % typedir, 'wb') as errfile:
                     status = subprocess.call(cmd, stdout=errfile,
                                              stderr=subprocess.STDOUT)
                 if status != 0:
                     print("%s - wget exited with status %d" % (typedir, status))
             except Exception as e:
                 # Deliberate best-effort boundary: one failed download (e.g.
                 # wget not installed) is recorded and must not abort the run.
                 with open("%s.exc" % typedir, 'wb') as exfile:
                     exfile.write(("exception: %s\n" % str(e)).encode("utf-8"))
                 print("%s - exception %s" % (typedir, str(e)))
	# (c) 2007-2010 IBM Corporation and Others. All Rights Reserved.
	# Python module for scanning and mirroring CLDR references.
	#
	# Steven R. Loomis. Oct 30th, 2007
	#
	#
	# usage: refmirror.py /path/to/cldr/common /path/to/nonexistent/refmirror-output-dir
	#
	# note:
	# - does condense duplicate URLs within a locale, to only download once
	# (should condense globally.)
	# - requires 'wget' installed.
	#
	# todo:
	# - only handles <references> formats - so CLDR 1.5 main/ but NOT collation/
	# - doesn't escape UTF-8 URLs such as wikipedia ( writes out url in utf-8, does not %-encode )
	# - should probably pass "-t 2" (--tries) and/or "-T <seconds>" (--timeout) to wget to shorten hang time
	#

	from xml.dom import minidom
	import sys
	import os
	import codecs
	import subprocess

	# Flat script: walks the CLDR tree given as the first argument and, for every
	# <reference> element that has a 'type' and an http(s) 'uri', writes the
	# element to <output>/<dir>/<file.xml>/<type>.xml and mirrors the referenced
	# page with wget into <output>/<dir>/<file.xml>/<type>/.
	# Uses call-style print/raise and 'except ... as', so it runs under Python 3.

	progname = sys.argv[0]

	if len(sys.argv) != 3:
	    raise RuntimeError("Usage: %s <cldrroot> <output dir>" % (sys.argv[0]))

	cldrdir = sys.argv[1]
	htmldir = sys.argv[2]

	# os.mkdir raises if the directory already exists; the output dir must be new.
	print("# creating %s (shouldn't exist)" % htmldir)
	os.mkdir(htmldir)

	print("# walking %s" % cldrdir)

	for name, subdirs, files in os.walk(cldrdir):
	    if name.endswith("/CVS"):
	        continue
	    # Prune CVS bookkeeping directories in place so os.walk never descends
	    # into them; their output parent directory is never created, so anything
	    # below them would make os.mkdir fail.
	    subdirs[:] = [d for d in subdirs if d != "CVS"]

	    # Path of this directory relative to cldrdir, without a leading slash.
	    leaf = name[len(cldrdir):]
	    if leaf.startswith('/'):
	        leaf = leaf[1:]
	    print("dir: %s" % str(leaf))
	    out = "%s/%s" % (htmldir, leaf)
	    if leaf:
	        os.mkdir(out)

	    for filename in files:
	        if not filename.endswith('.xml'):
	            continue

	        # Maps each URI already fetched for this XML file to the reference
	        # type that fetched it, so a repeated URI is downloaded only once.
	        alreadyread = {}

	        xmldir = "%s/%s" % (out, filename)

	        # read file
	        filepath = "%s/%s" % (name, filename)
	        dom = minidom.parse(filepath)

	        # documentElement is the root element whether or not a DOCTYPE or
	        # comment precedes it, so no positional childNodes index is needed.
	        root = dom.documentElement
	        if root is None:
	            continue
	        refNode = root.getElementsByTagName('references')
	        if not refNode:
	            continue
	        print("## got refnode %s" % filepath)
	        os.mkdir(xmldir)

	        # Only the first <references> element is scanned.
	        for ref in refNode[0].getElementsByTagName('reference'):
	            if not ref.hasAttribute('type'):
	                print("## untyped reference in %s" % filepath)
	                continue

	            reftype = ref.getAttribute('type')
	            if ref.hasAttribute('alt'):
	                reftype = "%s-%s" % (reftype, ref.getAttribute('alt'))
	            typedir = "%s/%s" % (xmldir, reftype)

	            if not ref.hasAttribute('uri'):
	                continue
	            uri = ref.getAttribute('uri')
	            if uri.startswith('urn:'):
	                uri = uri[len('urn:'):]
	            if uri.startswith(('isbn', 'ISBN')):
	                # assume ISBN can fend for itself
	                continue
	            if not uri.startswith('http'):
	                print("# Not a known scheme: %s on %s / %s" % (uri, filename, reftype))
	                continue

	            # Write the info file: the <reference> element itself, UTF-8 with
	            # a BOM. Binary mode because both writes are byte strings.
	            with open("%s.xml" % typedir, 'wb') as infofile:
	                infofile.write(codecs.BOM_UTF8)
	                infofile.write(ref.toxml().encode("utf-8"))
	            # Directory that receives the mirrored page (or duplicate.txt).
	            os.mkdir(typedir)

	            if uri in alreadyread:
	                # Already downloaded for this file: just record which
	                # reference type holds the copy.
	                with open("%s/duplicate.txt" % typedir, 'wb') as alfile:
	                    alfile.write(("%s\n" % alreadyread[uri]).encode("utf-8"))
	                continue

	            alreadyread[uri] = reftype
	            # Argument list, not a shell string: quotes or other shell
	            # metacharacters in the URI cannot break or alter the command.
	            # -P: download directory, -nd: no directory hierarchy,
	            # -np: never ascend to the parent, -k: convert links,
	            # -p: also fetch page requisites.
	            cmd = ["wget", "-P", typedir, "-nd", "-np", "-k", "-p", uri]
	            print(" ".join(cmd))
	            try:
	                # wget logs to stderr; send stdout and stderr to <type>.err.
	                with open("%s.err" % typedir, 'wb') as errfile:
	                    status = subprocess.call(cmd, stdout=errfile,
	                                             stderr=subprocess.STDOUT)
	                if status != 0:
	                    print("%s - wget exited with status %d" % (typedir, status))
	            except Exception as e:
	                # Deliberate best-effort boundary: one failed download (e.g.
	                # wget not installed) is recorded and must not abort the run.
	                with open("%s.exc" % typedir, 'wb') as exfile:
	                    exfile.write(("exception: %s\n" % str(e)).encode("utf-8"))
	                print("%s - exception %s" % (typedir, str(e)))