Rubén Rincón | 3d9c0a9 | 2017-03-28 14:45:38 +0200 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | import os |
| 3 | import argparse |
Matt Godbolt | 983c6bd | 2017-04-08 06:45:02 -0500 | [diff] [blame] | 4 | import re |
| 5 | import json |
| 6 | |
# BeautifulSoup is the only third-party dependency; fail with an actionable
# message when it is missing.
try:
    from bs4 import BeautifulSoup
except ImportError:
    # Raise a real exception object: raising a plain string is itself a
    # TypeError, which would hide the installation hint from the user.
    raise ImportError("Please install BeautifulSoup (apt-get install python-bs4 should do it)")
Rubén Rincón | 3d9c0a9 | 2017-03-28 14:45:38 +0200 | [diff] [blame] | 11 | |
# Command-line interface: the folder holding the input .html pages and the
# path of the JavaScript file to generate.
parser = argparse.ArgumentParser(
    description='Docenizes HTML version of the official Intel Asm PDFs')
parser.add_argument(
    '-i', '--inputfolder',
    type=str,
    default='./',
    help='Folder where the input files reside as .html. Default is current folder')
parser.add_argument(
    '-o', '--outputpath',
    type=str,
    default='./asm-docs.js',
    help='Final path of the .js file. Default is ./asm-docs.js')
Rubén Rincón | 3d9c0a9 | 2017-03-28 14:45:38 +0200 | [diff] [blame] | 17 | |
# The maximum number of paragraphs from the description to copy.
MAX_DESC_PARAS = 5

# Leading encoding tokens that may precede a mnemonic: two-digit hex bytes,
# REX / VEX. prefixes, "/x" markers and lower-case words, each followed by
# optional whitespace.
STRIP_PREFIX = re.compile(r'^(([0-9a-fA-F]{2}|(REX|VEX\.)[.0-9A-Z]*|/.|[a-z]+)\b\s*)*')

# An upper-case mnemonic (a letter followed by at least one more letter or
# digit), optionally followed by '*', then whitespace or end of string.
INSTRUCTION_RE = re.compile(r'^([A-Z][A-Z0-9]+)\*?(\s+|$)')

# Some instructions are so broken we just take their names from the filename.
UNPARSEABLE_INSTR_NAMES = [
    'PSRLW:PSRLD:PSRLQ',
    'PSLLW:PSLLD:PSLLQ',
]

# Some instructions are defined in multiple files. We ignore a specific set
# of the duplicates here.
IGNORED_DUPLICATES = [
    'MOV-1',  # move to control reg
    'MOV-2',  # move to debug reg
    'CMPSD',  # compare doubleword (defined in CMPS:CMPSB:CMPSW:CMPSD:CMPSQ)
    'MOVQ',  # defined in MOVD:MOVQ
    'MOVSD',  # defined in MOVS:MOVSB:MOVSW:MOVSD:MOVSQ
]
| 33 | |
| 34 | |
class Instruction(object):
    """Documentation record for one parsed instruction page.

    ``name``, ``names`` and ``body`` are stored as given; ``tooltip`` is
    stored with any trailing ':', ' ' and ',' characters removed.
    """

    def __init__(self, name, names, tooltip, body):
        # Trim trailing punctuation/space left over from the scraped text.
        trimmed_tooltip = tooltip.rstrip(': ,')
        self.name, self.names = name, names
        self.tooltip, self.body = trimmed_tooltip, body

    def __str__(self):
        # "<names> = <tooltip>" on the first line, the body below it.
        template = "{} = {}\n{}"
        return template.format(self.names, self.tooltip, self.body)
| 44 | |
| 45 | |
def strip_non_instr(i):
    """Return ``i`` with the leading text matched by STRIP_PREFIX removed.

    This drops encoding junk that appears before the mnemonic, e.g. in
    ``66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64``.
    """
    without_prefix = STRIP_PREFIX.sub('', i)
    return without_prefix
| 51 | |
| 52 | |
def instr_name(i):
    """Return the instruction mnemonic found in ``i``, or None if there is none.

    The text is first passed through strip_non_instr and must then start
    with something INSTRUCTION_RE accepts.
    """
    cleaned = strip_non_instr(i)
    found = INSTRUCTION_RE.match(cleaned)
    return found.group(1) if found else None
| 57 | |
| 58 | |
def get_description(section):
    """Return the first stripped text in ``section`` longer than 20 characters.

    ``section`` is an iterable of objects exposing ``get_text()``.

    Raises:
        RuntimeError: if no element yields text of more than 20 characters.
    """
    stripped_texts = (node.get_text().strip() for node in section)
    # Lazily pick the first candidate that is long enough to be useful.
    chosen = next((text for text in stripped_texts if len(text) > 20), None)
    if chosen is None:
        raise RuntimeError("Couldn't find decent description in {}".format(section))
    return chosen
| 65 | |
def parse(name, f):
    """Parse one instruction page into an Instruction.

    Args:
        name: page name. It is used for the UNPARSEABLE_INSTR_NAMES fallback
            and becomes the resulting Instruction's ``name``.
        f: the markup handed to BeautifulSoup.

    Returns:
        An Instruction, or None when no instruction names could be found
        and ``name`` is not listed in UNPARSEABLE_INSTR_NAMES.

    Raises:
        KeyError: if a table row has none of the expected instruction
            columns, or the page has no ``h2`` section titled 'Description'.
        RuntimeError: from get_description, if the Description section has
            no usable text.
    """
    doc = BeautifulSoup(f, 'html.parser')
    table = read_table(doc.table)
    names = set()

    def add_all(instrs):
        # Collect every recognisable mnemonic from a list of encoding lines.
        for candidate in instrs:
            mnemonic = instr_name(candidate)
            if mnemonic:
                names.add(mnemonic)

    for inst in table:
        if 'Opcode/Instruction' in inst:
            add_all(inst['Opcode/Instruction'].split("\n"))
        elif 'Opcode*/Instruction' in inst:
            add_all(inst['Opcode*/Instruction'].split("\n"))
        else:
            # Use a separate local here: reusing ``name`` would clobber the
            # page name needed for the fallback below and for the
            # Instruction that is returned.
            mnemonic = instr_name(inst['Instruction'])
            if not mnemonic:
                print("Unable to get instruction from: " + inst['Instruction'])
            else:
                names.add(mnemonic)
    if not names:
        if name in UNPARSEABLE_INSTR_NAMES:
            # The page name itself is a ':'-separated list of mnemonics.
            names.update(name.split(":"))
        else:
            return None

    # Group the non-blank siblings that follow each <h2> under its title.
    sections = {}
    for section_header in doc.find_all("h2"):
        children = []
        node = section_header.next_sibling
        while node and node.name != 'h2':
            if str(node).strip():
                children.append(node)
            node = node.next_sibling
        sections[section_header.text] = children

    description = sections['Description']
    return Instruction(
        name,
        names,
        get_description(description),
        "".join(str(x) for x in description[:MAX_DESC_PARAS]).strip())
| 107 | |
| 108 | |
def read_table(table):
    """Convert an instruction table element into a list of row dictionaries.

    Args:
        table: a table element exposing ``find_all``, or None when the page
            has no table at all.

    Returns:
        A list of dicts mapping column header text to cell text. An empty
        list is returned when ``table`` is None, or when a header-less table
        does not consist of exactly one row.
    """
    if table is None:
        # A page without any <table> has no rows to offer; let the caller
        # treat it as "no instructions found" instead of crashing here.
        return []

    headers = [h.get_text() for h in table.find_all('th')]
    result = []
    if headers:
        # common case
        for row in table.find_all('tr'):
            obj = {name: column.get_text()
                   for column, name in zip(row.find_all('td'), headers)}
            # Rows without any <td> (such as the header row) give nothing.
            if obj:
                result.append(obj)
    else:
        # Cases like BEXTR and BZHI
        rows = table.find_all('tr')
        if len(rows) != 1:
            return []
        obj = {}
        for td in rows[0].find_all('td'):
            # The bold text is the column header; remove it from the cell so
            # that the remaining text is the value.
            header = td.p.strong.get_text()
            td.p.strong.decompose()
            obj[header] = td.get_text()
        result.append(obj)

    return result
| 133 | |
| 134 | |
def parse_html(directory):
    """Parse every instruction page found under ``directory``.

    The tree is walked with os.walk. Each ``.html`` file is parsed, except
    ``index.html`` and files whose base name is in IGNORED_DUPLICATES.

    Args:
        directory: root folder to search.

    Returns:
        A list with the Instruction produced for each page. Pages for which
        parse() returns nothing are reported on stdout and skipped.
    """
    instructions = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".html") or filename == 'index.html':
                continue
            name = os.path.splitext(filename)[0]
            # Check the ignore list before opening, so skipped pages are
            # never touched.
            if name in IGNORED_DUPLICATES:
                continue
            with open(os.path.join(root, filename)) as f2:
                instruction = parse(name, f2)
            if not instruction:
                print("Unable to get instructions for " + filename)
                continue
            instructions.append(instruction)
    return instructions
| 150 | |
| 151 | |
if __name__ == '__main__':
    # Script entry point: parse all pages, report mnemonics claimed by more
    # than one page, then write a JavaScript module with one switch case per
    # mnemonic.
    args = parser.parse_args()
    instructions = parse_html(args.inputfolder)
    # A key function gives the same ordering as the old cmp-style comparator
    # and is accepted by both Python 2 and Python 3.
    instructions.sort(key=lambda inst: inst.name)
    all_inst = set()
    for inst in instructions:
        if not all_inst.isdisjoint(inst.names):
            print("Overlap in instruction names: {} for {}".format(
                inst.names.intersection(all_inst), inst.name))
        all_inst = all_inst.union(inst.names)

    # NOTE(review): whitespace inside the emitted JavaScript strings is
    # reproduced exactly as it appears in the reviewed source - confirm the
    # intended indentation of the generated file.
    with open(args.outputpath, 'w') as f:
        f.write("""
function getAsmOpcode(opcode) {
if (!opcode) return;
switch (opcode.toUpperCase()) {
""")
        for inst in instructions:
            # Sorted so the generated file does not depend on set ordering.
            for name in sorted(inst.names):
                f.write(' case "{}":\n'.format(name))
            f.write(' return {};\n\n'.format(json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": "http://www.felixcloutier.com/x86/{}.html".format(inst.name)
            })))
        f.write("""
}
}

module.exports = {
getAsmOpcode: getAsmOpcode
};
""")