| #!/usr/local/bin/clisp -C |
| |
| ;;; Creation of gnulib's uninames.h from the UnicodeData.txt table. |
| |
| ;;; Copyright (C) 2000-2020 Free Software Foundation, Inc. |
| ;;; Written by Bruno Haible <bruno@clisp.org>, 2000-12-28. |
| ;;; |
| ;;; This program is free software: you can redistribute it and/or modify |
| ;;; it under the terms of the GNU General Public License as published by |
| ;;; the Free Software Foundation; either version 3 of the License, or |
| ;;; (at your option) any later version. |
| ;;; |
| ;;; This program is distributed in the hope that it will be useful, |
| ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| ;;; GNU General Public License for more details. |
| ;;; |
| ;;; You should have received a copy of the GNU General Public License |
| ;;; along with this program. If not, see <https://www.gnu.org/licenses/>. |
| |
| (defparameter add-comments nil) |
| |
| (defstruct unicode-char |
| (index nil :type integer) |
| (name nil :type string) |
| word-indices |
| word-indices-index |
| ) |
| |
| (defstruct range |
| (index nil :type integer) |
| (start-code nil :type integer) |
| (end-code nil :type integer) |
| ) |
| |
| (defstruct word-list |
| (hashed nil :type hash-table) |
| (sorted nil :type list) |
| size ; number of characters total |
| length ; number of words |
| ) |
| |
| (defun main (inputfile outputfile aliasfile) |
| (declare (type string inputfile outputfile aliasfile)) |
| #+UNICODE (setq *default-file-encoding* charset:utf-8) |
| (let ((all-chars '()) |
| (all-chars-hashed (make-hash-table :test #'equal)) |
| (all-aliases '()) |
| all-chars-and-aliases |
| (all-ranges '()) |
| (name-index 0) |
| range) |
| ;; Read all characters and names from the input file. |
| (with-open-file (istream inputfile :direction :input) |
| (loop |
| (let ((line (read-line istream nil nil))) |
| (unless line (return)) |
| (let* ((i1 (position #\; line)) |
| (i2 (position #\; line :start (1+ i1))) |
| (code-string (subseq line 0 i1)) |
| (code (parse-integer code-string :radix 16)) |
| (name-string (subseq line (1+ i1) i2))) |
| ; Ignore characters whose name starts with "<". |
| (unless (eql (char name-string 0) #\<) |
| ; Also ignore Hangul syllables; they are treated specially. |
| (unless (<= #xAC00 code #xD7A3) |
| ; Also ignore CJK compatibility ideographs; they are treated |
| ; specially as well. |
| (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A) |
| (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D)) |
| ;; Also ignore variationselectors; they are treated |
| ;; specially as well. |
| (unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF)) |
| (push (make-unicode-char :index name-index |
| :name name-string) |
| all-chars) |
| (setf (gethash code all-chars-hashed) (car all-chars)) |
| ;; Update the contiguous range, or start a new range. |
| (if (and range (= (1+ (range-end-code range)) code)) |
| (setf (range-end-code range) code) |
| (progn |
| (when range |
| (push range all-ranges)) |
| (setq range (make-range :index name-index |
| :start-code code |
| :end-code code)))) |
| (incf name-index) |
| (setq last-code code) |
| ) ) ) ) |
| ) ) ) ) |
| (setq all-chars (nreverse all-chars)) |
| (if range |
| (push range all-ranges)) |
| (setq all-ranges (nreverse all-ranges)) |
| (when aliasfile |
| ;; Read all characters and names from the alias file. |
| (with-open-file (istream aliasfile :direction :input) |
| (loop |
| (let ((line (read-line istream nil nil))) |
| (unless line (return)) |
| (let* ((i1 (position #\; line)) |
| (i2 (position #\; line :start (1+ i1))) |
| (code-string (subseq line 0 i1)) |
| (code (parse-integer code-string :radix 16)) |
| (name-string (subseq line (1+ i1) i2)) |
| (uc (gethash code all-chars-hashed))) |
| (when uc |
| (push (make-unicode-char :index (unicode-char-index uc) |
| :name name-string) |
| all-aliases) |
| ) ) ) ) ) ) |
| (setq all-aliases (nreverse all-aliases) |
| all-chars-and-aliases (append all-chars all-aliases)) |
| ;; Split into words. |
| (let ((words-by-length (make-array 0 :adjustable t))) |
| (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION" |
| (mapcar #'unicode-char-name all-chars-and-aliases))) |
| (let ((i1 0)) |
| (loop |
| (when (>= i1 (length name)) (return)) |
| (let ((i2 (or (position #\Space name :start i1) (length name)))) |
| (let* ((word (subseq name i1 i2)) |
| (len (length word))) |
| (when (>= len (length words-by-length)) |
| (adjust-array words-by-length (1+ len)) |
| ) |
| (unless (aref words-by-length len) |
| (setf (aref words-by-length len) |
| (make-word-list |
| :hashed (make-hash-table :test #'equal) |
| :sorted '() |
| ) ) ) |
| (let ((word-list (aref words-by-length len))) |
| (unless (gethash word (word-list-hashed word-list)) |
| (setf (gethash word (word-list-hashed word-list)) t) |
| (push word (word-list-sorted word-list)) |
| ) ) |
| ) |
| (setq i1 (1+ i2)) |
| ) ) ) ) |
| ;; Sort the word lists. |
| (dotimes (len (length words-by-length)) |
| (unless (aref words-by-length len) |
| (setf (aref words-by-length len) |
| (make-word-list |
| :hashed (make-hash-table :test #'equal) |
| :sorted '() |
| ) ) ) |
| (let ((word-list (aref words-by-length len))) |
| (setf (word-list-sorted word-list) |
| (sort (word-list-sorted word-list) #'string<) |
| ) |
| (setf (word-list-size word-list) |
| (reduce #'+ (mapcar #'length (word-list-sorted word-list))) |
| ) |
| (setf (word-list-length word-list) |
| (length (word-list-sorted word-list)) |
| ) ) ) |
| ;; Output the tables. |
| (with-open-file (ostream outputfile :direction :output |
| #+UNICODE :external-format #+UNICODE charset:ascii) |
| (format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%") |
| (format ostream "/*~%") |
| (format ostream " * ~A~%" (file-namestring outputfile)) |
| (format ostream " *~%") |
| (format ostream " * Unicode character name table.~%") |
| (format ostream " * Generated automatically by the gen-uninames utility.~%") |
| (format ostream " */~%") |
| (format ostream "~%") |
| (format ostream "static const char unicode_name_words[~D] = {~%" |
| (let ((sum 0)) |
| (dotimes (len (length words-by-length)) |
| (let ((word-list (aref words-by-length len))) |
| (incf sum (word-list-size word-list)) |
| ) ) |
| sum |
| ) ) |
| (dotimes (len (length words-by-length)) |
| (let ((word-list (aref words-by-length len))) |
| (dolist (word (word-list-sorted word-list)) |
| (format ostream " ~{ '~C',~}~%" (coerce word 'list)) |
| ) ) ) |
| (format ostream "};~%") |
| (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%" |
| (let ((sum 0)) |
| (dotimes (len (length words-by-length)) |
| (let ((word-list (aref words-by-length len))) |
| (incf sum (word-list-length word-list)) |
| ) ) |
| sum |
| ) ) |
| #| ; Redundant data |
| (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%" |
| (let ((sum 0)) |
| (dotimes (len (length words-by-length)) |
| (let ((word-list (aref words-by-length len))) |
| (incf sum (word-list-length word-list)) |
| ) ) |
| sum |
| ) ) |
| (dotimes (len (length words-by-length)) |
| (let ((word-list (aref words-by-length len))) |
| (when (word-list-sorted word-list) |
| (format ostream " ") |
| (do ((l (word-list-sorted word-list) (cdr l)) |
| (offset 0 (+ offset (length (car l))))) |
| ((endp l)) |
| (format ostream "~<~% ~0,79:; ~D,~>" offset) |
| ) |
| (format ostream "~%") |
| ) ) ) |
| (format ostream "};~%") |
| |# |
| (format ostream "static const struct { uint32_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%" |
| (1+ (length words-by-length)) |
| ) |
| (let ((extra-offset 0) |
| (ind-offset 0)) |
| (dotimes (len (length words-by-length)) |
| (let ((word-list (aref words-by-length len))) |
| (format ostream " { ~D, ~D },~%" extra-offset ind-offset) |
| (incf extra-offset (word-list-size word-list)) |
| (incf ind-offset (word-list-length word-list)) |
| ) ) |
| (format ostream " { ~D, ~D }~%" extra-offset ind-offset) |
| ) |
| (format ostream "};~%") |
| (let ((ind-offset 0)) |
| (dotimes (len (length words-by-length)) |
| (let ((word-list (aref words-by-length len))) |
| (dolist (word (word-list-sorted word-list)) |
| (setf (gethash word (word-list-hashed word-list)) ind-offset) |
| (incf ind-offset) |
| ) ) ) ) |
| (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION")) |
| (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word |
| (gethash word (word-list-hashed (aref words-by-length (length word)))) |
| ) ) |
| ;; Compute the word-indices for every unicode-char. |
| (dolist (uc all-chars-and-aliases) |
| (let ((name (unicode-char-name uc)) |
| (indices '())) |
| (let ((i1 0)) |
| (loop |
| (when (>= i1 (length name)) (return)) |
| (let ((i2 (or (position #\Space name :start i1) (length name)))) |
| (let* ((word (subseq name i1 i2)) |
| (len (length word))) |
| (push (gethash word (word-list-hashed (aref words-by-length len))) |
| indices |
| ) |
| ) |
| (setq i1 (1+ i2)) |
| ) ) ) |
| (setf (unicode-char-word-indices uc) |
| (coerce (nreverse indices) 'vector) |
| ) |
| ) ) |
| ;; Sort the list of unicode-chars by word-indices. |
| (setq all-chars-and-aliases |
| (sort all-chars-and-aliases |
| (lambda (vec1 vec2) |
| (let ((len1 (length vec1)) |
| (len2 (length vec2))) |
| (do ((i 0 (1+ i))) |
| (nil) |
| (if (< i len2) |
| (if (< i len1) |
| (cond ((< (aref vec1 i) (aref vec2 i)) (return t)) |
| ((> (aref vec1 i) (aref vec2 i)) (return nil)) |
| ) |
| (return t) |
| ) |
| (return nil) |
| ) ) ) ) |
| :key #'unicode-char-word-indices |
| ) ) |
| ;; Output the word-indices. |
| (format ostream "static const uint16_t unicode_names[~D] = {~%" |
| (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases)) |
| ) |
| (let ((i 0)) |
| (dolist (uc all-chars-and-aliases) |
| (format ostream " ~{ ~D,~}" |
| (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0))) |
| (coerce (unicode-char-word-indices uc) 'list) |
| ) |
| ) |
| (when add-comments |
| (format ostream "~40T/* ~A */" (unicode-char-name uc)) |
| ) |
| (format ostream "~%") |
| (setf (unicode-char-word-indices-index uc) i) |
| (incf i (length (unicode-char-word-indices uc))) |
| ) ) |
| (format ostream "};~%") |
| (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%") |
| (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") |
| (format ostream "unicode_name_to_index[~D] = {~%" |
| (length all-chars-and-aliases) |
| ) |
| (dolist (uc all-chars-and-aliases) |
| (format ostream " { 0x~4,'0X, ~D }," |
| (unicode-char-index uc) |
| (unicode-char-word-indices-index uc) |
| ) |
| (when add-comments |
| (format ostream "~21T/* ~A */" (unicode-char-name uc)) |
| ) |
| (format ostream "~%") |
| ) |
| (format ostream "};~%") |
| (format ostream "static const struct { uint16_t index; uint32_t name:24; }~%") |
| (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%") |
| (format ostream "unicode_index_to_name[~D] = {~%" |
| (length all-chars) |
| ) |
| (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index)) |
| (format ostream " { 0x~4,'0X, ~D }," |
| (unicode-char-index uc) |
| (unicode-char-word-indices-index uc) |
| ) |
| (when add-comments |
| (format ostream "~21T/* ~A */" (unicode-char-name uc)) |
| ) |
| (format ostream "~%") |
| ) |
| (format ostream "};~%") |
| (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%" |
| (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases)) |
| ) |
| (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%" |
| (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases)) |
| ) |
| (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%" |
| (length all-ranges)) |
| (dolist (range all-ranges) |
| (format ostream " { ~D, ~D, ~D },~%" |
| (range-index range) |
| (- (range-start-code range) (range-index range)) |
| (1+ (- (range-end-code range) (range-start-code range)))) |
| ) |
| (format ostream "};~%") |
| ) |
| ) ) ) |
| |
| (main (first *args*) (second *args*) (third *args*)) |