| #!/usr/bin/env python |
| |
| # Copyright (C) 2015 The Android Open Source Project |
| # |
| # Licensed under the Apache License, Version 2.0 (the 'License'); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an 'AS IS' BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| """ |
| Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp) |
| into binary format. See doc/hyb_file_format.md for more information. |
| |
| Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb |
| |
| Optional -v parameter turns on verbose debugging. |
| |
| """ |
| |
| from __future__ import print_function |
| |
| import io |
| import sys |
| import struct |
| import math |
| import getopt |
| |
| |
# Set by the -v command-line flag; enables debug printing throughout.
VERBOSE = False


# Python 3 removed unichr(); alias it to chr() so the rest of this file
# can call unichr() under either major version.
if sys.version_info[0] >= 3:
    unichr = chr
| |
| |
# number of bits required to represent numbers up to n inclusive
def num_bits(n):
    """Return the number of bits needed to represent values 0..n inclusive.

    Uses int.bit_length() instead of 1 + int(math.log(n, 2)): the log-based
    form relies on float rounding and can be off by one for large n close to
    a power of two, while bit_length() is exact for all non-negative ints.
    num_bits(0) == 0, matching the original behavior.
    """
    return n.bit_length()
| |
| |
class Node:
    """A single trie node.

    Attributes (filled in as the trie is built and processed):
      succ    -- dict mapping an edge character to the child Node
      res     -- list of hyphenation codes for the string ending here, or None
      fsm_pat -- reserved, never assigned in this file
      fail    -- reserved, never assigned in this file
    """

    def __init__(self):
        self.fail = None
        self.fsm_pat = None
        self.res = None
        self.succ = {}
| |
| |
# List of free slots, implemented as doubly linked list
class Freelist:
    """Doubly linked list of free array slots.

    pred[i]/succ[i] give the neighboring free slots of slot i (None at the
    ends); pred[i] == -1 marks slot i as occupied.  first/last track the
    ends of the free list.  The list grows on demand.
    """

    def __init__(self):
        self.first = None
        self.last = None
        self.pred = []
        self.succ = []

    def grow(self):
        """Append one new free slot at the tail of the list."""
        new_ix = len(self.pred)
        self.pred.append(self.last)
        self.succ.append(None)
        if self.last is None:
            # list was empty: the new slot is also the head
            self.first = new_ix
        else:
            self.succ[self.last] = new_ix
        self.last = new_ix

    def next(self, cursor):
        """Return (slot, following_cursor); cursor 0 restarts from the head.

        Grows the list when there is no free slot to hand out.
        """
        if cursor == 0:
            cursor = self.first
        if cursor is None:
            self.grow()
            slot = self.last
        else:
            slot = cursor
        return slot, self.succ[slot]

    def is_free(self, ix):
        """True if slot ix is unoccupied, growing the list to cover ix."""
        while len(self.pred) <= ix:
            self.grow()
        return self.pred[ix] != -1

    def use(self, ix):
        """Mark slot ix occupied, unlinking it from the free list."""
        before = self.pred[ix]
        after = self.succ[ix]
        if before is None:
            self.first = after
        else:
            self.succ[before] = after
        if after is None:
            self.last = before
        else:
            self.pred[after] = before
        assert self.pred[ix] != -1, 'double free!'
        self.pred[ix] = -1
| |
| |
def combine(a, b):
    """Merge two code sequences, aligned at their ends, taking the max.

    None acts as the identity.  The longer sequence's leading excess is kept
    as-is; overlapping positions get the elementwise maximum.  Returns a new
    list; neither input is mutated.
    """
    if a is None:
        return b
    if b is None:
        return a
    shorter, longer = (a, b) if len(a) <= len(b) else (b, a)
    offset = len(longer) - len(shorter)
    merged = longer[:offset]
    merged.extend(max(x, y) for x, y in zip(shorter, longer[offset:]))
    return merged
| |
| |
def trim(pattern):
    """Drop leading zero codes; returns None if every entry is zero.

    NOTE(review): appears unused elsewhere in this file.
    """
    for start, value in enumerate(pattern):
        if value != 0:
            return pattern[start:]
    return None
| |
| |
def pat_to_binary(pattern):
    """Serialize a list of small ints (0..255) into one byte per value."""
    fmt = '%dB' % len(pattern)
    return struct.pack(fmt, *pattern)
| |
| |
class Hyph:
    """Trie of hyphenation patterns and exceptions.

    Nodes are Node instances linked through their .succ dicts; node_list
    holds every node in creation order.  bfs() assigns each node a bfs_ix
    and records self.bfs_order; dedup() merges identical subtries; pack()
    assigns each surviving node an array slot node.ix for serialization.
    """

    def __init__(self):
        self.root = Node()
        self.root.str = '<root>'
        self.node_list = [self.root]

    # Add a pattern (word fragment with numeric codes, such as ".ad4der")
    def add_pat(self, pat):
        """Split a TeX pattern into its letters (word) and digit codes (result).

        Digits become entries in result; once the first digit has been seen,
        every subsequent letter-letter boundary and the final position get an
        explicit 0.  Consumers align result with the end of word (see the
        end-aligned offset arithmetic in traverse_trie).
        """
        lastWasLetter = False
        haveSeenNumber = False
        result = []
        word = ''
        for c in pat:
            if c.isdigit():
                result.append(int(c))
                lastWasLetter = False
                haveSeenNumber = True
            else:
                word += c
                if lastWasLetter and haveSeenNumber:
                    result.append(0)
                lastWasLetter = True
        if lastWasLetter:
            result.append(0)

        self.add_word_res(word, result)

    # Add an exception (word with hyphens, such as "ta-ble")
    def add_exception(self, hyph_word):
        """Encode an exception word: code 11 marks an allowed hyphen, code 10
        a forbidden break between other letter pairs; the word is wrapped in
        '.' anchors and two trailing 0 codes cover the final positions."""
        res = []
        word = ['.']
        need_10 = False
        for c in hyph_word:
            if c == '-':
                res.append(11)
                need_10 = False
            else:
                if need_10:
                    res.append(10)
                word.append(c)
                need_10 = True
        word.append('.')
        res.append(0)
        res.append(0)
        if VERBOSE:
            print(word, res)
        self.add_word_res(''.join(word), res)

    def add_word_res(self, word, result):
        """Insert word into the trie, creating nodes as needed, and store
        result on the terminal node (overwriting any previous result)."""
        if VERBOSE:
            print(word, result)

        t = self.root
        s = ''
        for c in word:
            s += c
            if c not in t.succ:
                new_node = Node()
                new_node.str = s  # prefix string, used for debug messages
                self.node_list.append(new_node)
                t.succ[c] = new_node
            t = t.succ[c]
        t.res = result

    def pack(self, node_list, ch_map, use_node=False):
        """First-fit packing of nodes into one flat array.

        For each node, finds an offset ix such that the node slot and each
        edge slot ix + ch_map[c] (+1 when use_node) are all unoccupied, then
        claims them.  Sets node.ix and self.node_map, and returns the total
        array size.  With use_node=True, node and edge slots share a single
        address space, so edge codes are shifted by 1 to avoid colliding
        with the node slot itself.
        """
        size = 0
        self.node_map = {}
        nodes = Freelist()
        edges = Freelist()
        edge_start = 1 if use_node else 0
        for node in node_list:
            succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()])
            if len(succ):
                # scan the edge freelist for a position where the node slot
                # and all of its edge slots fit
                cursor = 0
                while True:
                    edge_ix, cursor = edges.next(cursor)
                    ix = edge_ix - succ[0]
                    if (ix >= 0 and nodes.is_free(ix) and
                        all(edges.is_free(ix + s) for s in succ) and
                        ((not use_node) or edges.is_free(ix))):
                        break
            elif use_node:
                ix, _ = edges.next(0)
                nodes.is_free(ix)  # actually don't need nodes at all when use_node,
                                   # but keep it happy
            else:
                ix, _ = nodes.next(0)
            node.ix = ix
            self.node_map[ix] = node
            nodes.use(ix)
            size = max(size, ix)
            if use_node:
                edges.use(ix)
            for s in succ:
                edges.use(ix + s)
        # leave room for the edges of the highest-placed node
        size += max(ch_map.values()) + 1
        return size

    # return list of nodes in bfs order
    def bfs(self, ch_map):
        """Breadth-first traversal; children are visited in ch_map code
        order.  Assigns node.bfs_ix, stores the order in self.bfs_order,
        and returns it.  Asserts that no node has two edges mapping to the
        same alphabet code."""
        result = [self.root]
        ix = 0
        while ix < len(result):
            node = result[ix]
            node.bfs_ix = ix
            mapped = {}
            for c, next in node.succ.items():
                assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c))
                mapped[ch_map[c]] = next
            for i in sorted(mapped.keys()):
                result.append(mapped[i])
            ix += 1
        self.bfs_order = result
        return result

    # suffix compression - convert the trie into an acyclic digraph, merging nodes when
    # the subtries are identical
    def dedup(self):
        """Merge nodes whose subtries are identical.

        Processes nodes in reverse BFS order so each node's children are
        already canonicalized; a node's key is its result codes plus the
        canonical index of every successor.  Returns (dedup_ix, uniques)
        where dedup_ix maps each bfs index to its canonical bfs index and
        uniques lists the surviving nodes in BFS order.
        """
        uniques = []
        dupmap = {}
        dedup_ix = [0] * len(self.bfs_order)
        for ix in reversed(range(len(self.bfs_order))):
            # construct string representation of node
            node = self.bfs_order[ix]
            if node.res is None:
                s = ''
            else:
                s = ''.join(str(c) for c in node.res)
            for c in sorted(node.succ.keys()):
                succ = node.succ[c]
                s += ' ' + c + str(dedup_ix[succ.bfs_ix])
            if s in dupmap:
                dedup_ix[ix] = dupmap[s]
            else:
                uniques.append(node)
                dedup_ix[ix] = ix
                dupmap[s] = dedup_ix[ix]
        uniques.reverse()
        print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total')
        return dedup_ix, uniques
| |
| |
# load the ".pat" file, which contains patterns such as a1b2c3
def load(fn):
    """Parse a TeX-style .pat file at fn into a fresh Hyph trie."""
    hyph = Hyph()
    with io.open(fn, encoding='UTF-8') as f:
        for line in f:
            hyph.add_pat(line.strip())
    return hyph
| |
| |
# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc.
def load_chr(fn):
    """Read a .chr file and return {char: alphabet index}; '.' maps to 0.

    Line i (0-based) defines alphabet index i + 1 for each character on it.
    """
    ch_map = {'.': 0}
    with io.open(fn, encoding='UTF-8') as f:
        for i, line in enumerate(f):
            entry = line.strip()
            if len(entry) <= 2:
                assert len(entry) == 2, 'expected 2 chars in chr'
            else:
                # lowercase maps to a multi-character uppercase sequence;
                # ignore the uppercase part for now
                entry = entry[:1]
            for ch in entry:
                ch_map[ch] = i + 1
    return ch_map
| |
| |
# load exceptions with explicit hyphens
def load_hyp(hyph, fn):
    """Feed each stripped line of the .hyp exception file into hyph."""
    with io.open(fn, encoding='UTF-8') as f:
        for line in f:
            hyph.add_exception(line.strip())
| |
| |
def generate_header(alphabet, trie, pattern):
    """Build the 24-byte hyb header: magic, version 0, then the byte
    offsets of the alphabet, trie, and pattern sections and the file size."""
    header_size = 6 * 4
    alphabet_off = header_size
    trie_off = alphabet_off + len(alphabet)
    pattern_off = trie_off + len(trie)
    file_size = pattern_off + len(pattern)
    fields = (0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size)
    return struct.pack('<6I', *fields)
| |
| |
def generate_alphabet(ch_map):
    """Serialize the alphabet table.

    Uses dense format 0 (a byte per codepoint in [min_ch, max_ch]) when the
    codepoint range is under 1024 and all values fit a byte; otherwise sparse
    format 1 with sorted (codepoint << 11 | value) words.  Output is padded
    to a multiple of 4 bytes.  The input dict is not modified.
    """
    ch_map = ch_map.copy()
    del ch_map['.']
    min_ch = ord(min(ch_map))
    max_ch = ord(max(ch_map))
    if max_ch - min_ch < 1024 and max(ch_map.values()) < 256:
        # format 0: dense byte table indexed by codepoint - min_ch
        table = [0] * (max_ch - min_ch + 1)
        for ch, val in ch_map.items():
            table[ord(ch) - min_ch] = val
        pieces = [struct.pack('<3I', 0, min_ch, max_ch + 1)]
        pieces.extend(struct.pack('<B', v) for v in table)
    else:
        # format 1: sorted list of packed (codepoint, value) entries
        assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded'
        pieces = [struct.pack('<2I', 1, len(ch_map))]
        pieces.extend(struct.pack('<I', (ord(ch) << 11) | val)
                      for ch, val in sorted(ch_map.items()))
    binary = b''.join(pieces)
    padding = -len(binary) % 4
    return binary + b'\x00' * padding
| |
| |
# assumes hyph structure has been packed, ie node.ix values have been set
def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap):
    """Serialize the packed trie into its binary section.

    Each of the n_trie slots becomes one little-endian uint32 packing
    (pattern_index << pattern_shift) | (link << link_shift) | char_code.
    A 6-word header records version 0, the masks/shifts, and n_trie.
    dedup_ix (from Hyph.dedup) redirects edges to canonical nodes; pass
    dedup_ix=None to link to the original successors directly.
    """
    ch_array = [0] * n_trie
    link_array = [0] * n_trie
    pat_array = [0] * n_trie
    # field widths: enough bits for the largest alphabet code / slot index
    link_shift = num_bits(max(ch_map.values()))
    char_mask = (1 << link_shift) - 1
    pattern_shift = link_shift + num_bits(n_trie - 1)
    link_mask = (1 << pattern_shift) - (1 << link_shift)
    result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)]

    for node in dedup_nodes:
        ix = node.ix
        if node.res is not None:
            # index into the pattern table built by generate_pattern
            pat_array[ix] = patmap[pat_to_binary(node.res)]
        for c, next in node.succ.items():
            c_num = ch_map[c]
            link_ix = ix + c_num  # edge slot, offset by the char code
            ch_array[link_ix] = c_num
            if dedup_ix is None:
                dedup_next = next
            else:
                # follow the edge to the canonical (deduplicated) node
                dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]]
            link_array[link_ix] = dedup_next.ix

    for i in range(n_trie):
        #print((pat_array[i], link_array[i], ch_array[i]))
        packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i]
        result.append(struct.pack('<I', packed))
    return b''.join(result)
| |
| |
def generate_pattern(pats):
    """Build the pattern section; returns (patmap, binary).

    patmap maps each distinct serialized pattern to its entry index (the
    empty pattern is entry 0).  Each entry word encodes
    (raw_length << 26) | (trailing_zero_count << 20) | raw_data_offset,
    with identical raw byte strings shared between entries.
    """
    pat_entries = [0]
    patmap = {b'': 0}

    raw_chunks = []
    raw_size = 0
    raw_offsets = {}

    for pat in pats:
        if pat is None:
            continue
        key = pat_to_binary(pat)
        if key in patmap:
            continue
        # trailing zero codes are not stored; they become the shift field
        shift = 0
        while shift < len(pat) and pat[len(pat) - shift - 1] == 0:
            shift += 1
        rawpat = key[:len(pat) - shift]
        if rawpat not in raw_offsets:
            raw_offsets[rawpat] = raw_size
            raw_chunks.append(rawpat)
            raw_size += len(rawpat)
        patmap[key] = len(pat_entries)
        pat_entries.append((len(rawpat) << 26) | (shift << 20) | raw_offsets[rawpat])
    header = struct.pack('<4I', 0, len(pat_entries), 16 + 4 * len(pat_entries), raw_size)
    body = [header]
    body.extend(struct.pack('<I', entry) for entry in pat_entries)
    body.extend(raw_chunks)
    return patmap, b''.join(body)
| |
| |
def generate_hyb_file(hyph, ch_map, hyb_fn):
    """Run the full pipeline (BFS, dedup, pack, serialize each section)
    and write the resulting hyb binary to hyb_fn."""
    hyph.bfs(ch_map)
    dedup_ix, dedup_nodes = hyph.dedup()
    n_trie = hyph.pack(dedup_nodes, ch_map)
    alphabet = generate_alphabet(ch_map)
    patmap, pattern = generate_pattern([n.res for n in hyph.node_list])
    trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap)
    header = generate_header(alphabet, trie, pattern)

    with open(hyb_fn, 'wb') as f:
        for section in (header, alphabet, trie, pattern):
            f.write(section)
| |
| |
# Verify that the file contains the same lines as the lines argument, in arbitrary order
def verify_file_sorted(lines, fn):
    """Return True if file fn contains exactly the given lines (as sets,
    order-insensitive); otherwise print each difference and return False."""
    with io.open(fn, encoding='UTF-8') as f:
        file_set = set(l.strip() for l in f)
    line_set = set(lines)
    if line_set == file_set:
        return True
    for line in line_set - file_set:
        print(repr(line) + ' in reconstruction, not in file')
    for line in file_set - line_set:
        print(repr(line) + ' in file, not in reconstruction')
    return False
| |
| |
def map_to_chr(alphabet_map):
    """Invert {char: value} into ({value: lowercase char}, chr-file lines).

    Each line is the lowercase character followed by its uppercase pairing
    if any.  Note: iterates alphabet_map.values() directly, so a value
    shared by a case pair produces its line once per character; callers
    compare the lines as sets, so duplicates are harmless.
    """
    lines = []
    ch_map = {}
    for val in alphabet_map.values():
        matching = [ch for ch in alphabet_map if alphabet_map[ch] == val]
        # non-cased characters (like Ethiopic) appear in both lists,
        # matching the chr file
        lower = [ch for ch in matching if not ch.isupper()]
        upper = [ch for ch in matching if not ch.islower()]
        assert len(lower) == 1, 'expected 1 lowercase character'
        assert 0 <= len(upper) <= 1, 'expected 0 or 1 uppercase character'
        ch_map[val] = lower[0]
        lines.append(''.join(lower + upper))
    ch_map[0] = '.'
    return (ch_map, lines)
| |
| |
def get_pattern(pattern_data, ix):
    """Decode entry ix of the pattern section back into raw code bytes,
    re-appending the trailing zeros that were stored as the shift field."""
    (raw_base,) = struct.unpack('<I', pattern_data[8:12])
    entry_pos = 16 + ix * 4
    (entry,) = struct.unpack('<I', pattern_data[entry_pos:entry_pos + 4])
    length = entry >> 26
    shift = (entry >> 20) & 0x1f
    start = raw_base + (entry & 0xfffff)
    return pattern_data[start:start + length] + b'\0' * shift
| |
| |
def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions):
    """Recursively walk the binary trie, reconstructing text lines.

    ix is the current node's slot, s the string spelled so far (ch_map maps
    alphabet codes back to characters).  Reconstructed pattern lines are
    appended to patterns; exception words (codes 10/11 present) go to
    exceptions with their '.' anchors stripped.
    """
    (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20])
    node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0]
    pattern = node_entry >> pattern_shift
    if pattern:
        result = []
        is_exception = False
        pat = get_pattern(pattern_data, pattern)
        # interleave codes and letters; codes are end-aligned with s
        for i in range(len(s) + 1):
            pat_off = i - 1 + len(pat) - len(s)
            if pat_off < 0:
                code = 0  # position before the pattern's first code
            else:
                code = struct.unpack('B', pat[pat_off : pat_off + 1])[0]
            if 1 <= code <= 9:
                result.append('%d' % code)  # ordinary TeX priority digit
            elif code == 10:
                is_exception = True  # forbidden break: exception word
            elif code == 11:
                result.append('-')   # explicit hyphen: exception word
                is_exception = True
            else:
                assert code == 0, 'unexpected code'
            if i < len(s):
                result.append(s[i])
        pat_str = ''.join(result)
        #print(`pat_str`, `pat`)
        if is_exception:
            assert pat_str[0] == '.', "expected leading '.'"
            assert pat_str[-1] == '.', "expected trailing '.'"
            exceptions.append(pat_str[1:-1])  # strip leading and trailing '.'
        else:
            patterns.append(pat_str)
    # follow each outgoing edge whose slot actually stores this char code
    for ch in ch_map:
        edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0]
        link = (edge_entry & link_mask) >> link_shift
        if link != 0 and ch == (edge_entry & char_mask):
            sch = s + ch_map[ch]
            traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions)
| |
| |
# Verify the generated binary file by reconstructing the textual representations
# from the binary hyb file, then checking that they're identical (mod the order of
# lines within the file, which is irrelevant). This function makes assumptions that
# are stronger than absolutely necessary (in particular, that the patterns are in
# lowercase as defined by python islower).
def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn):
    """Round-trip check: parse hyb_fn back into chr/pat/hyp text and assert
    each matches the corresponding source file (order-insensitive)."""
    with open(hyb_fn, 'rb') as f:
        hyb_data = f.read()
    # split the file into sections using the header offsets
    header = hyb_data[0: 6 * 4]
    (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header)
    alphabet_data = hyb_data[alphabet_off:trie_off]
    trie_data = hyb_data[trie_off:pattern_off]
    pattern_data = hyb_data[pattern_off:file_size]

    # reconstruct alphabet table
    alphabet_version = struct.unpack('<I', alphabet_data[:4])[0]
    alphabet_map = {}
    if alphabet_version == 0:
        # dense format: one byte per codepoint in [min_ch, max_ch)
        (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12])
        for ch in range(min_ch, max_ch):
            offset = 12 + ch - min_ch
            b = struct.unpack('B', alphabet_data[offset : offset + 1])[0]
            if b != 0:
                alphabet_map[unichr(ch)] = b
    else:
        # sparse format: packed (codepoint << 11 | value) entries
        assert alphabet_version == 1
        n_entries = struct.unpack('<I', alphabet_data[4:8])[0]
        for i in range(n_entries):
            entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0]
            alphabet_map[unichr(entry >> 11)] = entry & 0x7ff

    ch_map, reconstructed_chr = map_to_chr(alphabet_map)

    # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587
    if u'\u0587' in reconstructed_chr:
        reconstructed_chr.remove(u'\u0587')
        reconstructed_chr.append(u'\u0587\u0535\u0552')

    assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified'

    # reconstruct trie
    patterns = []
    exceptions = []
    traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions)

    # EXCEPTION for Bulgarian (bg), which contains an ineffectual line of <0, U+044C, 0>
    if u'\u044c' in patterns:
        patterns.remove(u'\u044c')
        patterns.append(u'0\u044c0')

    assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified'
    assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified'
| |
| |
def main():
    """Command-line entry point: parse flags, build the hyb file, verify it.

    Fixes two failure modes of the original: a wrong argument count raised a
    bare ValueError from tuple unpacking, and a pattern file not ending in
    '.pat.txt' left chr_fn/ch_map/hyp_fn undefined, crashing later with a
    NameError.  Both now exit with a clear message.
    """
    global VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'v')
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)
    for o, _ in opts:
        if o == '-v':
            VERBOSE = True
    if len(args) != 2:
        print('usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb')
        sys.exit(1)
    pat_fn, out_fn = args
    if not pat_fn.endswith('.pat.txt'):
        print('error: pattern file name must end in .pat.txt')
        sys.exit(1)
    hyph = load(pat_fn)
    # derive the companion .chr and .hyp file names from the .pat name
    chr_fn = pat_fn[:-8] + '.chr.txt'
    ch_map = load_chr(chr_fn)
    hyp_fn = pat_fn[:-8] + '.hyp.txt'
    load_hyp(hyph, hyp_fn)
    generate_hyb_file(hyph, ch_map, out_fn)
    verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn)


if __name__ == '__main__':
    main()