# (c) 2007-2010 IBM Corporation and Others. All Rights Reserved.
# Python module for scanning and mirroring CLDR references.
#
# Steven R. Loomis. Oct 30th, 2007
#
#
# usage: refmirror.py /path/to/cldr/common /path/to/nonexistent/refmirror-output-dir
#
# note:
#  - condenses duplicate URLs within a locale, so each is downloaded only once
#    (should condense globally.)
#  - requires 'wget' installed.
#
# todo:
#  - only handles <references> formats - so CLDR 1.5 main/ but NOT collation/
#  - doesn't escape UTF-8 URLs such as wikipedia ( writes out url in utf-8, does not %-encode )
#  - should probably pass "-t 2" (--tries) or such to wget to shorten hang time
#

from xml.dom import minidom
import codecs
import os
import subprocess
import sys

def _say(message):
    """Print *message* without tripping over non-ASCII text.

    On Python 2 the values read from the XML are ``unicode``; printing such
    text to a non-terminal stdout can raise UnicodeEncodeError, so it is
    encoded as UTF-8 first.  On Python 3 every message is already ``str``
    and is printed unchanged.
    """
    if not isinstance(message, str):
        message = message.encode("utf-8")
    print(message)


def _write_bytes(path, data):
    """Write the byte string *data* to *path*, closing the file even on error."""
    with open(path, "wb") as handle:
        handle.write(data)


def _classify_uri(uri):
    """Split a reference URI into ``(kind, target)``.

    A leading ``urn:`` is dropped first.  *kind* is ``"isbn"`` for ISBN
    references (never downloaded), ``"http"`` for anything starting with
    ``http`` (which also matches ``https``), and ``"unknown"`` otherwise.
    *target* is the URI with the ``urn:`` prefix removed.
    """
    if uri.startswith("urn:"):
        uri = uri[len("urn:"):]
    if uri.startswith(("isbn", "ISBN")):
        return "isbn", uri
    if uri.startswith("http"):
        return "http", uri
    return "unknown", uri


def _run_wget(typedir, uri):
    """Mirror *uri* into the directory *typedir* using wget.

    wget's stdout and stderr are both captured in ``<typedir>.err``.  The
    command is passed as an argument list (no shell), so quotes or other
    shell metacharacters in a URI or path cannot alter the command.  If wget
    cannot be started at all, the error is recorded in ``<typedir>.exc`` and
    processing continues; a non-zero exit status is reported on stdout.
    """
    argv = ["wget", "-P", typedir, "-nd", "-np", "-k", "-p", uri]
    logpath = "%s.err" % typedir
    _say("%s > %s 2>&1" % (" ".join(argv), logpath))
    try:
        with open(logpath, "wb") as log:
            status = subprocess.call(argv, stdout=log,
                                     stderr=subprocess.STDOUT)
    except (OSError, ValueError) as e:
        # OSError: wget missing / log not writable.  ValueError also covers
        # the UnicodeError raised for arguments that cannot be encoded.
        _write_bytes("%s.exc" % typedir,
                     ("exception: %s\n" % str(e)).encode("utf-8"))
        _say("%s - exception %s" % (typedir, str(e)))
        return
    if status != 0:
        _say("# wget exited with status %d for %s" % (status, uri))


def _mirror_reference(ref, xmldir, xmlname, filepath, seen):
    """Mirror one ``<reference>`` element into *xmldir*.

    ref      -- the minidom ``<reference>`` element
    xmldir   -- output directory for the XML file being processed
    xmlname  -- name of that XML file (used in messages only)
    filepath -- path of that XML file (used in messages only)
    seen     -- dict mapping already-downloaded URIs to the reference type
                that downloaded them; updated in place

    For a typed reference with an http(s) URI this writes ``<type>.xml``
    (the element itself, UTF-8 with BOM), creates the ``<type>`` directory
    and either downloads the URI into it or, if the URI was already
    downloaded for this file, leaves a ``duplicate.txt`` naming the earlier
    reference type.  References without a type, without a URI, with an ISBN
    URI or with an unrecognised scheme are skipped.
    """
    if not ref.hasAttribute("type"):
        _say("## untyped reference in %s" % filepath)
        return

    reftype = ref.getAttribute("type")
    if ref.hasAttribute("alt"):
        reftype = "%s-%s" % (reftype, ref.getAttribute("alt"))
    typedir = "%s/%s" % (xmldir, reftype)

    if not ref.hasAttribute("uri"):
        return
    kind, uri = _classify_uri(ref.getAttribute("uri"))
    if kind == "isbn":
        # assume ISBN can fend for itself
        return
    if kind != "http":
        _say("# Not a known scheme: %s on %s / %s" % (uri, xmlname, reftype))
        return

    # write the Info file, then make the directory the download goes into
    _write_bytes("%s.xml" % typedir,
                 codecs.BOM_UTF8 + ref.toxml().encode("utf-8"))
    os.mkdir(typedir)

    if uri in seen:
        _write_bytes("%s/duplicate.txt" % typedir,
                     ("%s\n" % seen[uri]).encode("utf-8"))
        return
    seen[uri] = reftype
    _run_wget(typedir, uri)


def _mirror_file(srcdir, outdir, xmlname):
    """Mirror every reference of the XML file *xmlname* found in *srcdir*.

    Nothing is created when the file has no ``<references>`` element;
    otherwise a directory named after the file is created under *outdir*
    and each ``<reference>`` of the first ``<references>`` element is
    processed.  Duplicate URIs are tracked per file.
    """
    filepath = "%s/%s" % (srcdir, xmlname)
    # documentElement is the root element whether or not a DOCTYPE or
    # comments precede it.
    root = minidom.parse(filepath).documentElement
    refnodes = root.getElementsByTagName("references")
    if not refnodes:
        return

    _say("## got refnode %s" % filepath)
    xmldir = "%s/%s" % (outdir, xmlname)
    os.mkdir(xmldir)

    seen = {}  # uri -> reference type that already downloaded it
    for ref in refnodes[0].getElementsByTagName("reference"):
        _mirror_reference(ref, xmldir, xmlname, filepath, seen)


def main(argv=None):
    """Scan a CLDR tree and mirror the pages its references point to.

    argv -- ``[program, cldrroot, outputdir]``; defaults to ``sys.argv``.

    *outputdir* must not exist yet; it is created, and the directory layout
    of *cldrroot* (minus CVS directories) is reproduced underneath it.

    Raises RuntimeError with a usage message when the argument count is
    wrong, and OSError when a directory cannot be created.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 3:
        progname = argv[0] if argv else "refmirror"
        raise RuntimeError("Usage: %s <cldrroot> <output dir>" % progname)

    cldrdir, htmldir = argv[1], argv[2]

    _say("# creating %s (shouldn't exist)" % htmldir)
    os.mkdir(htmldir)

    _say("# walking %s" % cldrdir)
    for name, subdirs, files in os.walk(cldrdir):
        # Prune CVS directories so nothing beneath them is visited either.
        subdirs[:] = [d for d in subdirs if d != "CVS"]
        if name.endswith("/CVS"):
            continue

        # Path of this directory relative to the CLDR root.
        leaf = name[len(cldrdir):]
        if leaf.startswith("/"):
            leaf = leaf[1:]
        _say("dir: %s" % leaf)

        out = "%s/%s" % (htmldir, leaf)
        if leaf:
            os.mkdir(out)

        for xmlname in files:
            if xmlname.endswith(".xml"):
                _mirror_file(name, out, xmlname)


if __name__ == "__main__":
    main()