#! /usr/bin/env python

##############################################
# Patch a generatex .idx file to improve     #
# index formatting.                          #
#                                            #
# Author: Scott Pakin                        #
##############################################

import re
import sys


def allow_breaks(s: str) -> str:
    """Allow line breaks in a long, verbatim string.

    Walks the string one character at a time and, between selected pairs
    of adjacent characters, splices in text of the form
    ``+<break command>\\spverb+``.  The caller is expected to wrap the
    result in ``\\spverb+...+`` so that each splice closes one
    ``+``-delimited run and opens the next.

    A splice is inserted between a previous character ``p`` and the
    current character ``c`` according to the first rule that applies:

    1. ``p`` is lowercase and ``c`` is uppercase: insert ``\\-``.
    2. ``c`` is one of ``[({`` and ``p`` is not: insert ``\\linebreak[0]``.
    3. ``p`` is one of ``-.`` and ``c`` is not: insert ``\\linebreak[0]``.

    Args:
        s: The verbatim text to process.

    Returns:
        The text with break points inserted.  An empty string is returned
        unchanged.
    """
    if not s:
        # Nothing to break; also avoids indexing into an empty string.
        return s

    break_before = '[({'
    break_after = '-.'
    hyphenation = '+\\-\\spverb+'
    line_break = '+\\linebreak[0]\\spverb+'

    toks = [s[0]]
    for prev, c in zip(s, s[1:]):
        if c.isupper() and prev.islower():
            # Allow hyphenation between a lowercase letter and an
            # uppercase letter.
            toks.append(hyphenation)
        elif prev not in break_before and c in break_before:
            # Allow a line break before certain characters, but only at
            # the start of a run of them.
            toks.append(line_break)
        elif prev in break_after and c not in break_after:
            # Allow a line break after certain characters, but only at
            # the end of a run of them.
            toks.append(line_break)
        toks.append(c)
    return ''.join(toks)


# Map punctuation to names.
punct2name = {
    '\\{': 'lcurly',
    '\\}': 'rcurly',
    '\\|': 'Vert',
    '|': 'vert',
}

# Define regular expressions to search for formatting that can be improved.
magic_re = re.compile(r'^\\indexentry\{_=\\(magic[A-Za-z]+)')
punct_re = re.compile(r'^\\indexentry{_=\\(?:sp)?verb\+([^+]+)\+')
num_arg_re = re.compile(r'\{([0-9A-Fa-f]+)\}=')
paren_spc_re = re.compile(r'\(\\(\S+)\s*\)')
long_verb_re = re.compile(r'=\\spverb\+\\([^+]{25,})\+')
dancers_re = re.compile(r'\(\\Pisymbol\s*\{dancers\}\{(\d+)\}\)')
twemoji_flag_1_re = re.compile(r'=flag:\s+St\.\s+(.*?)\\\s+\(\\twemoji')
twemoji_flag_2_re = re.compile(r'\{flag:\s+(.*?)'
                               r'=flag:\s+(.*?)\\\s+\(\\twemoji')
twemoji_flag_3_re = re.compile(r'\{(.*?)\s+flag=(.*?)\s+flag\\\s+\(\\twemoji')
cs_space_re = re.compile(r'(\\[A-Za-z]+)\s+(?![A-Za-z\n])')
sym_num_re = re.compile(r'^\\indexentry{([a-z][a-zA-Z]+\d)=\1(.*)$')
extra_space_re = re.compile(r'\s{2,}')
backslash_space_re = re.compile(r'(\w)\\\s+')
curly_special_re = re.compile(r'\{([>={}!])\}')
twemoji_bang_re = re.compile(r'([A-Z]{2,}!)')

###########################################################################


def _patch_line(line):
    """Return one line of the .idx file with every formatting fix applied.

    The fixes run in a fixed order because later ones see the output of
    earlier ones.  Lines too short to hold a sort key are passed through
    the key-escaping step untouched instead of raising IndexError.
    """
    # Collapse multiple spaces into one and remove unnecessarily
    # backslashed spaces.
    # NOTE(review): \s also matches "\n", so a line that ends in
    # whitespace followed by a newline loses that newline here.
    line = extra_space_re.sub(' ', line)
    line = backslash_space_re.sub(r'\1 ', line)

    # Escape special characters.  "\indexentry{" is 12 characters long,
    # so offset 12 is the first character after it and offset 13 is
    # expected to be the "=" that follows a one-character key.
    line = curly_special_re.sub(r'{!\1}', line)
    if len(line) > 13 and line[12] in '>=|{}' and line[13] == '=':
        special = line[12]
        rest = line[14:]
        if special not in '{}':
            rest = rest.replace(f'{special}+', f'!{special}+')
        line = line[:12] + '!' + special + '=' + rest
    line = line.replace(r'verb+\|+', r'verb+\+\texttt{\char"7C}')

    # Replace punctuation indexed under "_" with a "_" followed by a name.
    # Punctuation with no entry in punct2name is left alone.
    match = punct_re.match(line)
    if match is not None:
        name = punct2name.get(match[1])
        if name is not None:
            line = line[:13] + 'magic' + name + line[13:]

    # Consistently index "magic" punctuation.
    match = magic_re.match(line)
    if match is not None:
        line = line.replace('_', '_' + match[1], 1)

    # Format numerical arguments (decimal or hexadecimal) to a fixed
    # width of 5.
    if 'worldflag' not in line:
        line = num_arg_re.sub(lambda m: '{%s}=' % m[1].rjust(5, '0'), line)

    # Remove trailing spaces within parentheses to canonicalize formatting.
    line = paren_spc_re.sub(r'(\\\g<1>)', line)

    # Remove spaces after control sequences to canonicalize formatting
    # and improve typesetting.
    line = cs_space_re.sub(r'\1', line)

    # Allow line breaks in long control sequences typeset verbatim.
    line = long_verb_re.sub(
        lambda m: '=\\spverb+\\' + allow_breaks(m[1]) + '+', line)

    # Insert space after each dancers symbol so the closing parenthesis
    # doesn't overlap the symbol.
    line = dancers_re.sub(r'(\\Pisymbol{dancers}{\1}\\hspace{0.5em})', line)

    # Re-index all twemoji flags as subentries under "flags".
    line = twemoji_flag_1_re.sub(r'=flag: St.\\ \1\\ (\\twemoji', line)
    line = twemoji_flag_2_re.sub(r'{flags>\1=\2 (\\twemoji', line)
    if 'mailbox' not in line:
        line = twemoji_flag_3_re.sub(r'{flags>\1=\2 (\\twemoji', line)

    # Remove "keycap:" from all twemoji entries.
    if '\\indexentry{keycap: ' in line:
        line = line.replace('keycap: ', '')

    # Remove numbers from numbered symbols (e.g., converting "dog2" to
    # just "dog").
    match = sym_num_re.match(line)
    if match is not None:
        sym = match[1][:-1]
        line = '\\indexentry{%s=%s%s\n' % (sym, sym, match[2])

    # Double the "!" in emoji names containing that symbol such as "ON!
    # arrow" and "UP! button".  "!" is the escape character in our
    # Makeindex configuration.
    return twemoji_bang_re.sub(r'\1!', line)


def main(argv=None):
    """Patch the .idx file named by ``argv[1]`` in place.

    Args:
        argv: Argument vector; defaults to ``sys.argv``.

    Raises:
        SystemExit: With a usage message if no file name was given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit(f'Usage: {argv[0]} <file.idx>')
    path = argv[1]

    # Read the entire input file.
    with open(path) as r:
        lines = r.readlines()

    # Process the file line-by-line.
    patched = [_patch_line(ln) for ln in lines]

    # Overwrite the input file with the updates.
    with open(path, 'w') as w:
        w.writelines(patched)


if __name__ == '__main__':
    main()