# 2021-12-23
"""Generate an HTML table of Unihan readings for one Unicode block.

Reads tab-separated Unihan data files from the current directory, collects
the fields of interest per code point, and prints a complete HTML page to
standard output containing one table row per character of the configured
block that has at least one reading/numeric/definition field.
"""
import fileinput
import html
#import json
from sys import stdout

skipKeys = ('kCompatibilityVariant', 'kIICore', 'kRSUnicode', 'kHanyuPinyin',
            'kXHC1983', 'kHanyuPinlu', 'kTGHZ2013')
# keys used = 'c', 'kAccountingNumeric', 'kCantonese', 'kDefinition', 'kHangul',
# 'kJapaneseKun', 'kJapaneseOn', 'kKorean', 'kMandarin', 'kOtherNumeric',
# 'kPrimaryNumeric', 'kTang', 'kTotalStrokes', 'kVietnamese'

UNIHAN_FILES = ('Unihan_Readings.txt', 'Unihan_NumericValues.txt',
                'Unihan_IRGSources.txt')

block_start = "2F800"
block_end = "2FA1F"
block_name = "CJK Compatibility Ideographs Supplement"

# (field, label) pairs rendered before the Korean entry, in display order.
_READINGS_BEFORE_KOREAN = (
    ('kMandarin', 'M'),
    ('kCantonese', 'C'),
    ('kTang', 'T'),
    ('kJapaneseOn', 'J-On'),
    ('kJapaneseKun', 'J-Kun'),
)

# (field, suffix) pairs for the NumVal column; when several are present the
# LAST one listed wins, matching the original chain of overwriting `if`s.
_NUMERIC_SUFFIXES = (
    ('kPrimaryNumeric', 'P'),
    ('kAccountingNumeric', 'A'),
    ('kOtherNumeric', 'O'),
)

# Fields that every entry may carry without counting as "reading data".
_NON_READING_KEYS = ('c', 'kTotalStrokes')


def _text(value):
    """Escape *value* for use as HTML text content."""
    return html.escape(value, quote=False)


def _has_reading_data(fields):
    """Return True if *fields* holds anything besides the char and strokes."""
    return any(name not in _NON_READING_KEYS for name in fields)


def load_unihan(files=UNIHAN_FILES):
    """Parse Unihan data files into ``{codepoint: {field: value}}``.

    Lines starting with ``#`` and lines that do not consist of exactly three
    tab-separated columns are ignored, as are fields whose name starts with
    ``kIRG`` or is listed in ``skipKeys``. Each entry also gets a ``'c'``
    field holding the character itself, derived from the ``U+XXXX`` code
    point column.

    Args:
        files: Paths of the Unihan text files to read, in order.

    Returns:
        A dict keyed by the code point string (e.g. ``'U+4E00'``).
    """
    d = {}
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, and close the input stream when done.
    with fileinput.input(files=files,
                         openhook=fileinput.hook_encoded("utf-8")) as lines:
        for line in lines:
            if line.startswith("#"):
                continue
            components = line.split("\t")
            if len(components) != 3:
                continue
            codepoint, key, raw_value = components
            if key.startswith('kIRG') or key in skipKeys:
                continue
            if codepoint not in d:
                d[codepoint] = {'c': chr(int(codepoint[2:], base=16))}
            d[codepoint][key] = raw_value.rstrip()
    return d


def maketableline(key, data):
    """Return one HTML ``<tr>`` for a character.

    Args:
        key: Code point string such as ``'U+4E00'``; the part after the
            first two characters, lower-cased, becomes the row ``id``.
        data: Field dict for the character. ``'c'`` is required; every other
            field (including ``'kTotalStrokes'``) is optional and renders as
            an empty cell when absent.

    Returns:
        The row markup with six cells: code point, character, stroke count,
        readings, numeric value and definition. All data values are
        HTML-escaped.
    """
    readings = [
        f'<i>{label}:</i> {_text(data[field])}'
        for field, label in _READINGS_BEFORE_KOREAN
        if field in data
    ]

    korean_readings = []
    if 'kHangul' in data:
        # Keep only the text before the first ':' of the kHangul value.
        korean_readings.append(data['kHangul'].split(":")[0])
    if 'kKorean' in data:
        korean_readings.append(data['kKorean'])
    if korean_readings:
        readings.append(f'<i>K:</i> {_text(" ".join(korean_readings))}')

    if 'kVietnamese' in data:
        readings.append(f'<i>V:</i> {_text(data["kVietnamese"])}')
    reading_string = ', '.join(readings)

    num = ''
    for field, suffix in _NUMERIC_SUFFIXES:
        if field in data:
            num = f'{data[field]} {suffix}'

    # A character may have readings but no stroke count; show an empty cell
    # rather than failing with KeyError.
    strokes = data.get('kTotalStrokes', '')
    defn = data.get('kDefinition', '')
    row_id = html.escape(key[2:].lower())
    return (
        f'<tr id="{row_id}"><td>{_text(key)}<td>{_text(data["c"])}'
        f'<td>{_text(strokes)}<td>{reading_string}'
        f'<td>{_text(num)}<td>{_text(defn)}</tr>'
    )


def page_header(start=block_start, end=block_end, name=block_name):
    """Return the HTML preceding the table rows (head, intro, table header).

    Args:
        start: First code point of the block, as hex digits without ``U+``.
        end: Last code point of the block, as hex digits without ``U+``.
        name: Human-readable block name.
    """
    return (
        '<!DOCTYPE html><html lang=en><head><meta charset=utf-8> '
        '<meta name=viewport content="width=device-width,initial-scale=1">\n'
        f"<title>Unihan readings, {start} to {end} ({name})</title>\n"
        "<style> "
        "td:nth-child(2) { font-size: large; text-align: center; } "
        "td:nth-child(3) { text-align: center; padding-left: 2px; padding-right: 2px; } "
        "td:nth-child(4) { max-width: 45%; } "
        "td:nth-child(5) { text-align: center; max-width: 12%; word-break: break-word; } "
        "td:nth-child(6) { max-width: 30%; } "
        "tr:nth-child(odd) { background-color: #eee; color: #000; } "
        "tr:nth-child(even) { background-color: #fff; color: #000; } "
        "</style>"
        f"</head><body><h1>Unihan readings, {start} to {end}</h1>\n"
        "<p>Unicode version 14. Skips those characters that don't have any "
        "reading data. Page created on 2021-12-23.</p> "
        "<p>Abbreviations: <i>M</i>, Mandarin reading; <i>C</i>, Cantonese "
        "reading; <i>T</i>, Tang dynasty reading; <i>J-On</i>, Japanese "
        "On'yomi reading; <i>J-Kun</i>, Japanese Kun'yomi reading; "
        "<i>K</i>, Korean reading; <i>V</i>, Vietnamese reading. "
        "In the NumVal column, numbers are followed by <i>P</i>, for "
        "kPrimaryNumeric (characters which are mainly used for that numeric "
        "value); <i>A</i>, for kAccountingNumeric (characters used for "
        "accounting purposes, that can't be easily modified into other "
        "numeric characters); or <i>O</i>, for kOtherNumeric (for "
        "characters that have uncommon numeric uses).</p>\n"
        f"<p>This page contains the block <i>{name}</i>, U+{start} to U+{end}. "
        'For other blocks, see these pages: '
        '<a href="unihan-ext-a.html">Extension A (3400–4DBF)</a>, '
        '<a href="unihan-main.html">Unified Ideographs (4E00–9FFF)</a>, '
        '<a href="unihan-compat.html">Compatibility Ideographs (F900–FAFF)</a>, '
        '<a href="unihan-ext-b.html">Extension B (20000–2A6DF)</a>, '
        '<a href="unihan-ext-c.html">Extension C (2A700–2B73F)</a>, '
        '<a href="unihan-ext-d.html">Extension D (2B740–2B81F)</a>, '
        '<a href="unihan-ext-e.html">Extension E (2B820–2CEAF)</a>, '
        '<a href="unihan-ext-f.html">Extension F (2CEB0–2EBEF)</a>, '
        '<a href="unihan-compat-supplement.html">Compatibility Ideographs '
        'Supplement (2F800–2FA1F)</a>, '
        '<a href="unihan-ext-g.html">Extension G (30000–3134F)</a>. '
        '</p> <table> '
        '<tr><th>U+<th>Char<th>Strokes<th>Readings<th>NumVal<th>Definition</tr>'
    )


def main():
    """Load the Unihan files and print the HTML page for the block."""
    # The page declares charset=utf-8, so emit UTF-8 whatever the locale is.
    if hasattr(stdout, "reconfigure"):
        stdout.reconfigure(encoding="utf-8")

    d = load_unihan()
    #json.dump(d, stdout, ensure_ascii=False, indent=1)

    first = int(block_start, 16)
    last = int(block_end, 16)

    print(page_header())
    for k, v in sorted(d.items(), key=lambda p: int(p[0][2:], 16)):
        if not _has_reading_data(v):
            continue
        if not first <= int(k[2:], 16) <= last:
            continue
        print(maketableline(k, v))
    print("</table></body></html>")


if __name__ == "__main__":
    main()