1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
def lines_get(f):
    '''Parse a file like object, removing comments and returning a list of
    lines.

    Everything from the first '#' to the end of a line is discarded, along
    with the line terminator. Lines that are empty after that are dropped;
    the remaining lines are returned in file order, with spaces and tabs left
    untouched.

    @f must provide a readlines() method returning strings.'''
    def cut_comment(line):
        # partition() hands back the whole line as the head when there is no
        # '#' in it, so no separate "not found" branch is needed.
        return line.partition('#')[0]

    # Remove only genuine line terminators. The last line of a file may have
    # none, and unconditionally slicing off the final character would then
    # silently eat a character of data.
    uncommented = (cut_comment(raw.rstrip('\r\n')) for raw in f.readlines())
    return [line for line in uncommented if line]
def line_split(line):
    '''Break @line into its semicolon separated fields and return them as a
    list, with leading and trailing whitespace removed from every field.'''
    return [field.strip() for field in line.split(';')]
def codepoints_parse(token):
    '''Decode a Unicode style code-point specification written in hex.

    A lone value such as "0041" yields an int; a range such as "0041..005A"
    yields a (start, end) tuple of ints. ValueError is raised when @token
    holds more than one ".." separator or when a part is not valid hex.'''
    pieces = token.split('..')
    # str.split() never returns an empty list, so only "too many" needs a
    # guard here.
    if len(pieces) > 2:
        raise ValueError(token)
    values = [int(piece, 16) for piece in pieces]
    if len(values) == 1:
        return values[0]
    return (values[0], values[1])
def unicode_file_parse(input, map, default_value=None):
    '''Parse a file like object, @input where the first column is a code-point
    range and the second column is mapped via the given dict, @map.

    Returns a list of (start, end, value) tuples in file order. A single
    code-point is reported as a range whose start and end are equal. Entries
    whose mapped value equals @default_value are left out of the result.

    Raises ValueError for a line that does not split into exactly two
    columns. Lookup failures from @map (KeyError for a dict) propagate to
    the caller.'''
    ranges = []
    for raw_line in lines_get(input):
        fields = line_split(raw_line)
        if len(fields) != 2:
            raise ValueError(fields)
        codepoints = codepoints_parse(fields[0])
        value = map[fields[1]]
        if value == default_value:
            # Default-valued entries are implied, so they are not stored.
            continue
        if isinstance(codepoints, int):
            # Normalise a lone code-point to a one element range so every
            # entry has the same (start, end, value) shape.
            codepoints = (codepoints, codepoints)
        ranges.append((codepoints[0], codepoints[1], value))
    return ranges
def sort_and_merge(ranges):
    '''Collapse a list of (start, end, value) entries: neighbours are joined
    into one entry when the second starts exactly one past the end of the
    first and both carry the same value.

    @ranges is sorted in place as a side effect; the merged entries are
    returned as a new list.'''
    ranges.sort()
    merged = []
    for item in ranges:
        if merged:
            # The last element of merged is the range still open for growth.
            last = merged[-1]
            if last[1] + 1 == item[0] and last[2] == item[2]:
                merged[-1] = (last[0], item[1], item[2])
                continue
        merged.append(item)
    return merged
|