blob: 9f74d3f4605a9aa652f139dffa65d78cf3da85ec [file] [log] [blame] [raw]
Rubén Rincón3d9c0a92017-03-28 14:45:38 +02001# -*- coding: utf-8 -*-
2import os
3import argparse
Matt Godbolt983c6bd2017-04-08 06:45:02 -05004import re
5import json
6
# BeautifulSoup is the only third-party dependency; fail early with an
# actionable hint when it is missing.
try:
    from bs4 import BeautifulSoup
except ImportError:
    # Raising a bare string is itself a TypeError, which would hide this
    # message, so raise a real exception carrying the install hint instead.
    raise ImportError("Please install BeautifulSoup (apt-get install python-bs4 should do it)")
Rubén Rincón3d9c0a92017-03-28 14:45:38 +020011
# Command-line interface: where to read the .html pages from and where to
# write the generated .js file.
parser = argparse.ArgumentParser(
    description='Docenizes HTML version of the official Intel Asm PDFs')
parser.add_argument(
    '-i', '--inputfolder',
    type=str,
    default='./',
    help='Folder where the input files reside as .html. Default is current folder')
parser.add_argument(
    '-o', '--outputpath',
    type=str,
    default='./asm-docs.js',
    help='Final path of the .js file. Default is ./asm-docs.js')
Rubén Rincón3d9c0a92017-03-28 14:45:38 +020017
# Upper bound on how many nodes of the "Description" section are copied into
# the generated HTML body.
MAX_DESC_PARAS = 5

# Matches the leading run of encoding tokens (two-digit hex bytes, REX/VEX
# prefixes, "/x" markers, lower-case words) that can precede a mnemonic.
STRIP_PREFIX = re.compile(r'^(([0-9a-fA-F]{2}|(REX|VEX\.)[.0-9A-Z]*|/.|[a-z]+)\b\s*)*')

# Matches an upper-case mnemonic (optionally followed by '*') at the start of
# the string; group 1 is the mnemonic.
INSTRUCTION_RE = re.compile(r'^([A-Z][A-Z0-9]+)\*?(\s+|$)')

# Some instructions are so broken we just take their names from the filename.
UNPARSEABLE_INSTR_NAMES = [
    'PSRLW:PSRLD:PSRLQ',
    'PSLLW:PSLLD:PSLLQ',
]

# Some instructions are defined in multiple files. The file names listed here
# are the duplicates that get ignored.
IGNORED_DUPLICATES = [
    # move to control reg
    'MOV-1',
    # move to debug reg
    'MOV-2',
    # compare doubleword (defined in CMPS:CMPSB:CMPSW:CMPSD:CMPSQ)
    'CMPSD',
    # defined in MOVD:MOVQ
    'MOVQ',
    # defined in MOVS:MOVSB:MOVSW:MOVSD:MOVSQ
    'MOVSD',
]
33
34
class Instruction(object):
    """Plain record describing one documented instruction page.

    Attributes:
        name: identifier of the page, stored as given.
        names: collection of mnemonics covered by the page, stored as given.
        tooltip: short description with trailing ':', ' ' and ',' removed.
        body: longer description, stored as given.
    """

    def __init__(self, name, names, tooltip, body):
        # Trailing colons, spaces and commas are trimmed from the tooltip.
        trimmed_tooltip = tooltip.rstrip(': ,')
        self.name, self.names = name, names
        self.tooltip = trimmed_tooltip
        self.body = body

    def __str__(self):
        """Return "<names> = <tooltip>" followed by the body on a new line."""
        template = "{} = {}\n{}"
        return template.format(self.names, self.tooltip, self.body)
44
45
def strip_non_instr(i):
    """Return `i` with the leading text matched by STRIP_PREFIX removed.

    This removes junk from encodings where the opcode is in the middle of
    prefix stuff, e.g.
        66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64
    """
    return re.sub(STRIP_PREFIX, '', i)
51
52
def instr_name(i):
    """Return the mnemonic found in `i`, or None when there is none.

    The text is first passed through strip_non_instr(); the result must then
    match INSTRUCTION_RE, whose first group is returned.
    """
    found = INSTRUCTION_RE.match(strip_non_instr(i))
    return found.group(1) if found else None
57
58
def get_description(section):
    """Return the first stripped text in `section` longer than 20 characters.

    `section` is an iterable of nodes exposing get_text().

    Raises:
        RuntimeError: if no node has more than 20 characters of stripped text.
    """
    stripped_texts = (node.get_text().strip() for node in section)
    for text in stripped_texts:
        if len(text) > 20:
            return text
    raise RuntimeError("Couldn't find decent description in {}".format(section))
65
def parse(name, f):
    """Parse one instruction page into an Instruction.

    Args:
        name: name of the page (the caller passes the file name without its
            extension). It becomes Instruction.name and is the key looked up
            in UNPARSEABLE_INSTR_NAMES.
        f: markup (or open file) handed to BeautifulSoup.

    Returns:
        An Instruction, or None when no mnemonic could be extracted and
        `name` is not listed in UNPARSEABLE_INSTR_NAMES.

    Raises:
        KeyError: if a table row has none of the recognised instruction
            columns, or the page has no "Description" section.
        RuntimeError: from get_description() when the "Description" section
            has no usable text.
    """
    doc = BeautifulSoup(f, 'html.parser')
    table = read_table(doc.table)
    names = set()

    def add_all(instrs):
        # Collect every line that yields a mnemonic; lines that don't are
        # skipped silently.
        for i in instrs:
            mnemonic = instr_name(i)
            if mnemonic:
                names.add(mnemonic)

    for inst in table:
        if 'Opcode/Instruction' in inst:
            add_all(inst['Opcode/Instruction'].split("\n"))
        elif 'Opcode*/Instruction' in inst:
            add_all(inst['Opcode*/Instruction'].split("\n"))
        else:
            # Keep the result in its own local: rebinding `name` here would
            # clobber the page name used below for the UNPARSEABLE lookup and
            # for Instruction.name (and could even set it to None).
            mnemonic = instr_name(inst['Instruction'])
            if not mnemonic:
                print("Unable to get instruction from: %s" % (inst['Instruction'],))
            else:
                names.add(mnemonic)

    if not names:
        if name in UNPARSEABLE_INSTR_NAMES:
            # The page name itself is the ':'-separated list of mnemonics.
            names.update(name.split(":"))
        else:
            return None

    # Map each <h2> heading text to the non-blank sibling nodes that follow
    # it, stopping at the next <h2>.
    sections = {}
    for section_header in doc.find_all("h2"):
        children = []
        first = section_header.next_sibling
        while first and first.name != 'h2':
            if str(first).strip():
                children.append(first)
            first = first.next_sibling
        sections[section_header.text] = children

    description = sections['Description']
    return Instruction(
        name,
        names,
        get_description(description),
        "".join(str(x) for x in description[:MAX_DESC_PARAS]).strip())
107
108
def read_table(table):
    """Convert a parsed HTML table into a list of {header: cell text} dicts.

    Two layouts are handled:
      * tables with <th> cells: each row's <td> cells are paired with the
        header texts in order; rows that produce no pairs are dropped.
      * tables without <th> cells (cases like BEXTR and BZHI): only accepted
        when there is exactly one row; each <td> holds its header in
        <p><strong>, which is removed from the cell before the remaining
        text is read. Any other row count yields an empty list.
    """
    header_names = [cell.get_text() for cell in table.find_all('th')]

    if not header_names:
        # Cases like BEXTR and BZHI
        all_rows = table.find_all('tr')
        if len(all_rows) != 1:
            return []
        record = {}
        for cell in all_rows[0].find_all('td'):
            label = cell.p.strong
            key = label.get_text()
            # Remove the header element so it is not part of the cell text.
            label.decompose()
            record[key] = cell.get_text()
        return [record]

    # common case
    records = []
    for tr in table.find_all('tr'):
        record = {
            header: cell.get_text()
            for cell, header in zip(tr.find_all('td'), header_names)
        }
        if record:
            records.append(record)
    return records
133
134
def parse_html(directory):
    """Walk `directory` and parse every instruction page found in it.

    Every file ending in ".html", except "index.html", is opened; pages whose
    name (without extension) is in IGNORED_DUPLICATES are skipped, the others
    are handed to parse(). Pages for which parse() returns nothing are
    reported on stdout and left out.

    Returns:
        The list of parsed instructions, in directory-walk order.
    """
    parsed = []
    for root, _dirs, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith(".html") or filename == 'index.html':
                continue
            with open(os.path.join(root, filename)) as handle:
                name, _ext = os.path.splitext(filename)
                if name in IGNORED_DUPLICATES:
                    continue
                instruction = parse(name, handle)
                if instruction:
                    parsed.append(instruction)
                else:
                    print("Unable to get instructions for " + filename)
    return parsed
150
151
if __name__ == '__main__':
    # Script entry point: parse every page under the input folder and write a
    # JS module whose getAsmOpcode() switch maps each mnemonic to its docs.
    args = parser.parse_args()
    instructions = parse_html(args.inputfolder)
    # Sort by page name. A key function gives the same order as the old
    # cmp-style comparator, which list.sort() no longer accepts on Python 3.
    instructions.sort(key=lambda inst: inst.name)

    # Report (but do not drop) mnemonics claimed by more than one page.
    all_inst = set()
    for inst in instructions:
        if not all_inst.isdisjoint(inst.names):
            print("Overlap in instruction names: {} for {}".format(
                inst.names.intersection(all_inst), inst.name))
        all_inst = all_inst.union(inst.names)

    with open(args.outputpath, 'w') as f:
        f.write("""
function getAsmOpcode(opcode) {
 if (!opcode) return;
 switch (opcode.toUpperCase()) {
""")
        for inst in instructions:
            # All mnemonics of a page fall through to one shared return.
            for name in inst.names:
                f.write(' case "{}":\n'.format(name))
            f.write(' return {};\n\n'.format(json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": "http://www.felixcloutier.com/x86/{}.html".format(inst.name)
            })))
        f.write("""
 }
}

module.exports = {
 getAsmOpcode: getAsmOpcode
};
""")