blob: ab46eaed5971fb610cf751aabc3b911d1d7b2e29 [file] [log] [blame]
#!/usr/local/bin/clisp -C
;;; Creation of gnulib's uninames.h from the UnicodeData.txt table.
;;; Copyright (C) 2000-2020 Free Software Foundation, Inc.
;;; Written by Bruno Haible <bruno@clisp.org>, 2000-12-28.
;;;
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or
;;; (at your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <https://www.gnu.org/licenses/>.
(defparameter add-comments nil)
(defstruct unicode-char
(index nil :type integer)
(name nil :type string)
word-indices
word-indices-index
)
(defstruct range
(index nil :type integer)
(start-code nil :type integer)
(end-code nil :type integer)
)
(defstruct word-list
(hashed nil :type hash-table)
(sorted nil :type list)
size ; number of characters total
length ; number of words
)
(defun main (inputfile outputfile aliasfile)
(declare (type string inputfile outputfile aliasfile))
#+UNICODE (setq *default-file-encoding* charset:utf-8)
(let ((all-chars '())
(all-chars-hashed (make-hash-table :test #'equal))
(all-aliases '())
all-chars-and-aliases
(all-ranges '())
(name-index 0)
range)
;; Read all characters and names from the input file.
(with-open-file (istream inputfile :direction :input)
(loop
(let ((line (read-line istream nil nil)))
(unless line (return))
(let* ((i1 (position #\; line))
(i2 (position #\; line :start (1+ i1)))
(code-string (subseq line 0 i1))
(code (parse-integer code-string :radix 16))
(name-string (subseq line (1+ i1) i2)))
; Ignore characters whose name starts with "<".
(unless (eql (char name-string 0) #\<)
; Also ignore Hangul syllables; they are treated specially.
(unless (<= #xAC00 code #xD7A3)
; Also ignore CJK compatibility ideographs; they are treated
; specially as well.
(unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
(<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
;; Also ignore variationselectors; they are treated
;; specially as well.
(unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF))
(push (make-unicode-char :index name-index
:name name-string)
all-chars)
(setf (gethash code all-chars-hashed) (car all-chars))
;; Update the contiguous range, or start a new range.
(if (and range (= (1+ (range-end-code range)) code))
(setf (range-end-code range) code)
(progn
(when range
(push range all-ranges))
(setq range (make-range :index name-index
:start-code code
:end-code code))))
(incf name-index)
(setq last-code code)
) ) ) )
) ) ) )
(setq all-chars (nreverse all-chars))
(if range
(push range all-ranges))
(setq all-ranges (nreverse all-ranges))
(when aliasfile
;; Read all characters and names from the alias file.
(with-open-file (istream aliasfile :direction :input)
(loop
(let ((line (read-line istream nil nil)))
(unless line (return))
(let* ((i1 (position #\; line))
(i2 (position #\; line :start (1+ i1)))
(code-string (subseq line 0 i1))
(code (parse-integer code-string :radix 16))
(name-string (subseq line (1+ i1) i2))
(uc (gethash code all-chars-hashed)))
(when uc
(push (make-unicode-char :index (unicode-char-index uc)
:name name-string)
all-aliases)
) ) ) ) ) )
(setq all-aliases (nreverse all-aliases)
all-chars-and-aliases (append all-chars all-aliases))
;; Split into words.
(let ((words-by-length (make-array 0 :adjustable t)))
(dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
(mapcar #'unicode-char-name all-chars-and-aliases)))
(let ((i1 0))
(loop
(when (>= i1 (length name)) (return))
(let ((i2 (or (position #\Space name :start i1) (length name))))
(let* ((word (subseq name i1 i2))
(len (length word)))
(when (>= len (length words-by-length))
(adjust-array words-by-length (1+ len))
)
(unless (aref words-by-length len)
(setf (aref words-by-length len)
(make-word-list
:hashed (make-hash-table :test #'equal)
:sorted '()
) ) )
(let ((word-list (aref words-by-length len)))
(unless (gethash word (word-list-hashed word-list))
(setf (gethash word (word-list-hashed word-list)) t)
(push word (word-list-sorted word-list))
) )
)
(setq i1 (1+ i2))
) ) ) )
;; Sort the word lists.
(dotimes (len (length words-by-length))
(unless (aref words-by-length len)
(setf (aref words-by-length len)
(make-word-list
:hashed (make-hash-table :test #'equal)
:sorted '()
) ) )
(let ((word-list (aref words-by-length len)))
(setf (word-list-sorted word-list)
(sort (word-list-sorted word-list) #'string<)
)
(setf (word-list-size word-list)
(reduce #'+ (mapcar #'length (word-list-sorted word-list)))
)
(setf (word-list-length word-list)
(length (word-list-sorted word-list))
) ) )
;; Output the tables.
(with-open-file (ostream outputfile :direction :output
#+UNICODE :external-format #+UNICODE charset:ascii)
(format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
(format ostream "/*~%")
(format ostream " * ~A~%" (file-namestring outputfile))
(format ostream " *~%")
(format ostream " * Unicode character name table.~%")
(format ostream " * Generated automatically by the gen-uninames utility.~%")
(format ostream " */~%")
(format ostream "~%")
(format ostream "static const char unicode_name_words[~D] = {~%"
(let ((sum 0))
(dotimes (len (length words-by-length))
(let ((word-list (aref words-by-length len)))
(incf sum (word-list-size word-list))
) )
sum
) )
(dotimes (len (length words-by-length))
(let ((word-list (aref words-by-length len)))
(dolist (word (word-list-sorted word-list))
(format ostream " ~{ '~C',~}~%" (coerce word 'list))
) ) )
(format ostream "};~%")
(format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
(let ((sum 0))
(dotimes (len (length words-by-length))
(let ((word-list (aref words-by-length len)))
(incf sum (word-list-length word-list))
) )
sum
) )
#| ; Redundant data
(format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
(let ((sum 0))
(dotimes (len (length words-by-length))
(let ((word-list (aref words-by-length len)))
(incf sum (word-list-length word-list))
) )
sum
) )
(dotimes (len (length words-by-length))
(let ((word-list (aref words-by-length len)))
(when (word-list-sorted word-list)
(format ostream " ")
(do ((l (word-list-sorted word-list) (cdr l))
(offset 0 (+ offset (length (car l)))))
((endp l))
(format ostream "~<~% ~0,79:; ~D,~>" offset)
)
(format ostream "~%")
) ) )
(format ostream "};~%")
|#
(format ostream "static const struct { uint32_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
(1+ (length words-by-length))
)
(let ((extra-offset 0)
(ind-offset 0))
(dotimes (len (length words-by-length))
(let ((word-list (aref words-by-length len)))
(format ostream " { ~D, ~D },~%" extra-offset ind-offset)
(incf extra-offset (word-list-size word-list))
(incf ind-offset (word-list-length word-list))
) )
(format ostream " { ~D, ~D }~%" extra-offset ind-offset)
)
(format ostream "};~%")
(let ((ind-offset 0))
(dotimes (len (length words-by-length))
(let ((word-list (aref words-by-length len)))
(dolist (word (word-list-sorted word-list))
(setf (gethash word (word-list-hashed word-list)) ind-offset)
(incf ind-offset)
) ) ) )
(dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
(format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
(gethash word (word-list-hashed (aref words-by-length (length word))))
) )
;; Compute the word-indices for every unicode-char.
(dolist (uc all-chars-and-aliases)
(let ((name (unicode-char-name uc))
(indices '()))
(let ((i1 0))
(loop
(when (>= i1 (length name)) (return))
(let ((i2 (or (position #\Space name :start i1) (length name))))
(let* ((word (subseq name i1 i2))
(len (length word)))
(push (gethash word (word-list-hashed (aref words-by-length len)))
indices
)
)
(setq i1 (1+ i2))
) ) )
(setf (unicode-char-word-indices uc)
(coerce (nreverse indices) 'vector)
)
) )
;; Sort the list of unicode-chars by word-indices.
(setq all-chars-and-aliases
(sort all-chars-and-aliases
(lambda (vec1 vec2)
(let ((len1 (length vec1))
(len2 (length vec2)))
(do ((i 0 (1+ i)))
(nil)
(if (< i len2)
(if (< i len1)
(cond ((< (aref vec1 i) (aref vec2 i)) (return t))
((> (aref vec1 i) (aref vec2 i)) (return nil))
)
(return t)
)
(return nil)
) ) ) )
:key #'unicode-char-word-indices
) )
;; Output the word-indices.
(format ostream "static const uint16_t unicode_names[~D] = {~%"
(reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
)
(let ((i 0))
(dolist (uc all-chars-and-aliases)
(format ostream " ~{ ~D,~}"
(maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
(coerce (unicode-char-word-indices uc) 'list)
)
)
(when add-comments
(format ostream "~40T/* ~A */" (unicode-char-name uc))
)
(format ostream "~%")
(setf (unicode-char-word-indices-index uc) i)
(incf i (length (unicode-char-word-indices uc)))
) )
(format ostream "};~%")
(format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
(format ostream "unicode_name_to_index[~D] = {~%"
(length all-chars-and-aliases)
)
(dolist (uc all-chars-and-aliases)
(format ostream " { 0x~4,'0X, ~D },"
(unicode-char-index uc)
(unicode-char-word-indices-index uc)
)
(when add-comments
(format ostream "~21T/* ~A */" (unicode-char-name uc))
)
(format ostream "~%")
)
(format ostream "};~%")
(format ostream "static const struct { uint16_t index; uint32_t name:24; }~%")
(format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
(format ostream "unicode_index_to_name[~D] = {~%"
(length all-chars)
)
(dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
(format ostream " { 0x~4,'0X, ~D },"
(unicode-char-index uc)
(unicode-char-word-indices-index uc)
)
(when add-comments
(format ostream "~21T/* ~A */" (unicode-char-name uc))
)
(format ostream "~%")
)
(format ostream "};~%")
(format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
(reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
)
(format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
(reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
)
(format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
(length all-ranges))
(dolist (range all-ranges)
(format ostream " { ~D, ~D, ~D },~%"
(range-index range)
(- (range-start-code range) (range-index range))
(1+ (- (range-end-code range) (range-start-code range))))
)
(format ostream "};~%")
)
) ) )
(main (first *args*) (second *args*) (third *args*))