summaryrefslogtreecommitdiffstats
path: root/tdemarkdown/md4c/scripts/build_folding_map.py
diff options
context:
space:
mode:
Diffstat (limited to 'tdemarkdown/md4c/scripts/build_folding_map.py')
-rw-r--r--tdemarkdown/md4c/scripts/build_folding_map.py120
1 files changed, 120 insertions, 0 deletions
diff --git a/tdemarkdown/md4c/scripts/build_folding_map.py b/tdemarkdown/md4c/scripts/build_folding_map.py
new file mode 100644
index 000000000..b401775f5
--- /dev/null
+++ b/tdemarkdown/md4c/scripts/build_folding_map.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import textwrap
+
+
+self_path = os.path.dirname(os.path.realpath(__file__));
+f = open(self_path + "/unicode/CaseFolding.txt", "r")
+
+status_list = [ "C", "F" ]
+
+folding_list = [ dict(), dict(), dict() ]
+
+# Filter the foldings for "full" folding.
+for line in f:
+ comment_off = line.find("#")
+ if comment_off >= 0:
+ line = line[:comment_off]
+ line = line.strip()
+ if not line:
+ continue
+
+ raw_codepoint, status, raw_mapping, ignored_tail = line.split(";", 3)
+ if not status.strip() in status_list:
+ continue
+ codepoint = int(raw_codepoint.strip(), 16)
+ mapping = [int(it, 16) for it in raw_mapping.strip().split(" ")]
+ mapping_len = len(mapping)
+
+ if mapping_len in range(1, 4):
+ folding_list[mapping_len-1][codepoint] = mapping
+ else:
+ assert(False)
+f.close()
+
+
+# If we assume that (index0 ... index-1) makes a range (as defined below),
+# check that the newly provided index is compatible with the range too; i.e.
+# verify that the range can be extended without breaking its properties.
+#
+# Currently, we can handle ranges which:
+#
+# (1) either form consecutive sequence of codepoints and which map that range
+# to other consecutive range of codepoints (of the same length);
+#
+# (2) or a consecutive sequence of codepoints with step 2 where each codepoint
+# CP is mapped to the codepoint CP+1
+# (e.g. 0x1234 -> 0x1235; 0x1236 -> 0x1237; 0x1238 -> 0x1239; ...).
+#
+# Note: When the codepoints in the range are mapped to multiple codepoints,
+# only the 1st mapped codepoint is considered. All the other ones have to be
+# shared by all the mappings covered by the range.
+def is_range_compatible(folding, codepoint_list, index0, index):
+ N = index - index0
+ codepoint0 = codepoint_list[index0]
+ codepoint1 = codepoint_list[index0+1]
+ codepointN = codepoint_list[index]
+ mapping0 = folding[codepoint0]
+ mapping1 = folding[codepoint1]
+ mappingN = folding[codepointN]
+
+ # Check the range type (1):
+ if codepoint1 - codepoint0 == 1 and codepointN - codepoint0 == N \
+ and mapping1[0] - mapping0[0] == 1 and mapping1[1:] == mapping0[1:] \
+ and mappingN[0] - mapping0[0] == N and mappingN[1:] == mapping0[1:]:
+ return True
+
+ # Check the range type (2):
+ if codepoint1 - codepoint0 == 2 and codepointN - codepoint0 == 2 * N \
+ and mapping0[0] - codepoint0 == 1 \
+ and mapping1[0] - codepoint1 == 1 and mapping1[1:] == mapping0[1:] \
+ and mappingN[0] - codepointN == 1 and mappingN[1:] == mapping0[1:]:
+ return True
+
+ return False
+
+
+def mapping_str(list, mapping):
+ return ",".join("0x{:04x}".format(x) for x in mapping)
+
+for mapping_len in range(1, 4):
+ folding = folding_list[mapping_len-1]
+ codepoint_list = list(folding)
+
+ index0 = 0
+ count = len(folding)
+
+ records = list()
+ data_records = list()
+
+ while index0 < count:
+ index1 = index0 + 1
+ while index1 < count and is_range_compatible(folding, codepoint_list, index0, index1):
+ index1 += 1
+
+ if index1 - index0 > 2:
+ # Range of codepoints
+ records.append("R(0x{:04x},0x{:04x})".format(codepoint_list[index0], codepoint_list[index1-1]))
+ data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
+ data_records.append(mapping_str(data_records, folding[codepoint_list[index1-1]]))
+ index0 = index1
+ else:
+ # Single codepoint
+ records.append("S(0x{:04x})".format(codepoint_list[index0]))
+ data_records.append(mapping_str(data_records, folding[codepoint_list[index0]]))
+ index0 += 1
+
+ sys.stdout.write("static const unsigned FOLD_MAP_{}[] = {{\n".format(mapping_len))
+ sys.stdout.write("\n".join(textwrap.wrap(", ".join(records), 110,
+ initial_indent = " ", subsequent_indent=" ")))
+ sys.stdout.write("\n};\n")
+
+ sys.stdout.write("static const unsigned FOLD_MAP_{}_DATA[] = {{\n".format(mapping_len))
+ sys.stdout.write("\n".join(textwrap.wrap(", ".join(data_records), 110,
+ initial_indent = " ", subsequent_indent=" ")))
+ sys.stdout.write("\n};\n")
+
+
+