#! /usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import sys
import tarfile
import urllib
from urllib import request
from urllib import parse

try:
    from bs4 import BeautifulSoup
except ImportError:
    raise ImportError("Please install BeautifulSoup (apt-get install python3-bs4 or pip install beautifulsoup4 should do it)")

parser = argparse.ArgumentParser(description='Docenizes HTML version of the official Intel Asm PDFs')
parser.add_argument('-i', '--inputfolder', type=str,
                    help='Folder where the input files reside as .html. Default is ./asm-docs/',
                    default='asm-docs')
parser.add_argument('-o', '--outputpath', type=str, help='Final path of the .js file. Default is ./asm-docs.js',
                    default='./asm-docs.js')
parser.add_argument('-d', '--downloadfolder', type=str,
                    help='Folder where the archive will be downloaded and extracted', default='asm-docs')

# The maximum number of paragraphs from the description to copy.
MAX_DESC_PARAS = 5
STRIP_PREFIX = re.compile(r'^(([0-9a-fA-F]{2}|m64|NP|(REX|E?VEX\.)[.0-9A-Z]*|/[0-9a-z]+|[a-z]+)\b\s*)*')
INSTRUCTION_RE = re.compile(r'^([A-Z][A-Z0-9]+)\*?(\s+|$)')
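# Illustrative note (my reading of the two regexes above, not taken from the docs):
# for an encoding line such as "66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64",
# STRIP_PREFIX removes the leading hex bytes and the "/r" operand-encoding token,
# and INSTRUCTION_RE then captures "PMOVZXBW" as the mnemonic.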
# Some instructions are so broken we just take their names from the filename
UNPARSEABLE_INSTR_NAMES = ['PSRLW:PSRLD:PSRLQ', 'PSLLW:PSLLD:PSLLQ', 'MOVBE']
# Some files contain instructions which cannot be parsed and which compilers are unlikely to emit
IGNORED_FILE_NAMES = [
    # SGX pseudo-instructions
    "EADD",
    "EACCEPT",
    "EAUG",
    "EACCEPTCOPY",
    "EDECVIRTCHILD",
    "EINCVIRTCHILD",
    "EINIT",
    "ELDB:ELDU:ELDBC:ELBUC",
    "EMODPE",
    "EMODPR",
    "EMODT",
    "ERDINFO",
    "ESETCONTEXT",
    "ETRACKC",
    "EBLOCK",
    "ECREATE",
    "EDBGRD",
    "EDBGWR",
    "EENTER",
    "EEXIT",
    "EEXTEND",
    "EGETKEY",
    "ELDB",
    "ELDU",
    "ENCLS",
    "ENCLU",
    "EPA",
    "EREMOVE",
    "EREPORT",
    "ERESUME",
    "ETRACK",
    "EWB",
    # VMX instructions
    "INVEPT",
    "INVVPID",
    "VMCALL",
    "VMCLEAR",
    "VMFUNC",
    "VMLAUNCH",
    "VMLAUNCH:VMRESUME",
    "VMPTRLD",
    "VMPTRST",
    "VMREAD",
    "VMRESUME",
    "VMWRITE",
    "VMXOFF",
    "VMXON",
    # Other instructions
    "INVLPG",
    "LAHF",
    "RDMSR",
    "SGDT",
    # Unparsable instructions
    # These instructions should be supported in the future
    "MONITOR",
    "MOVDQ2Q",
    "MFENCE",
]
# Some instructions are defined in multiple files. We ignore a specific set of the
# duplicates here.
IGNORED_DUPLICATES = [
    'MOV-1',  # move to control reg
    'MOV-2',  # move to debug reg
    'CMPSD',  # compare doubleword (defined in CMPS:CMPSB:CMPSW:CMPSD:CMPSQ)
    'MOVQ',  # defined in MOVD:MOVQ
    'MOVSD',  # defined in MOVS:MOVSB:MOVSW:MOVSD:MOVSQ
    'VPBROADCASTB:VPBROADCASTW:VPBROADCASTD:VPBROADCASTQ',  # defined in VPBROADCAST
    "VGATHERDPS:VGATHERDPD",
    "VGATHERQPS:VGATHERQPD",
    "VPGATHERDD:VPGATHERQD",
    "VPGATHERDQ:VPGATHERQQ",
]
# Where to extract the asmdoc archive.
ASMDOC_DIR = "asm-docs"
ARCHIVE_URL = "http://www.felixcloutier.com/x86/x86.tbz2"
ARCHIVE_NAME = "x86.tbz2"


class Instruction(object):
    def __init__(self, name, names, tooltip, body):
        self.name = name
        self.names = names
        self.tooltip = tooltip.rstrip(': ,')
        self.body = body

    def __str__(self):
        return f"{self.name} = {self.tooltip}\n{self.body}"


def get_url_for_instruction(instr):
    return f"http://www.felixcloutier.com/x86/{urllib.parse.quote(instr.name)}.html"


def download_asm_doc_archive(downloadfolder):
    if not os.path.exists(downloadfolder):
        print(f"Creating {downloadfolder} as download folder")
        os.makedirs(downloadfolder)
    elif not os.path.isdir(downloadfolder):
        print(f"Error: download folder {downloadfolder} is not a directory")
        sys.exit(1)
    archive_name = os.path.join(downloadfolder, ARCHIVE_NAME)
    print("Downloading archive...")
    urllib.request.urlretrieve(ARCHIVE_URL, archive_name)


def extract_asm_doc_archive(downloadfolder, inputfolder):
    print("Extracting file...")
    if os.path.isdir(os.path.join(inputfolder, "html")):
        for root, dirs, files in os.walk(os.path.join(inputfolder, "html")):
            for file in files:
                if os.path.splitext(file)[1] == ".html":
                    os.remove(os.path.join(root, file))
    tar = tarfile.open(os.path.join(downloadfolder, ARCHIVE_NAME))
    tar.extractall(path=inputfolder)


def strip_non_instr(i):
    # removes junk from encodings where the opcode is in the middle
    # of prefix stuff. e.g.
    # 66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64
    return STRIP_PREFIX.sub('', i)


def instr_name(i):
    match = INSTRUCTION_RE.match(strip_non_instr(i))
    if match:
        return match.group(1)


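# A sketch of the page layout the function below assumes (inferred from the code,
# not from a spec): each instruction page has an element with id="description",
# and the descriptive <p> paragraphs follow it as siblings separated by newline
# text nodes, hence the repeated next_sibling.next_sibling steps.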
def get_description_paragraphs(document_soup):
    description_header_node = document_soup.find(id="description")
    i = 0
    description_paragraph_node = description_header_node.next_sibling.next_sibling
    description_paragraphs = []
    while i < MAX_DESC_PARAS and len(description_paragraph_node.text) > 20:
        if description_paragraph_node.name == "p":
            description_paragraphs.append(description_paragraph_node)
            i = i + 1
        # Move two siblings forward. Next sibling is the line feed.
        description_paragraph_node = description_paragraph_node.next_sibling.next_sibling
    return description_paragraphs


def parse(filename, f):
    doc = BeautifulSoup(f, 'html.parser')
    if doc.table is None:
        print(f"{filename}: Failed to find table")
        return None
    table = read_table(doc.table)
    names = set()

    def add_all(instrs):
        for i in instrs:
            instruction_name = instr_name(i)
            if instruction_name:
                names.add(instruction_name)

    for inst in table:
        if 'Opcode/Instruction' in inst:
            add_all(inst['Opcode/Instruction'].split("\n"))
        elif 'OpcodeInstruction' in inst:
            add_all(inst['OpcodeInstruction'].split("\n"))
        elif 'Opcode Instruction' in inst:
            add_all(inst['Opcode Instruction'].split("\n"))
        elif 'Opcode*/Instruction' in inst:
            add_all(inst['Opcode*/Instruction'].split("\n"))
        elif 'Opcode / Instruction' in inst:
            add_all(inst['Opcode / Instruction'].split("\n"))
        elif 'Instruction' in inst:
            instruction_name = instr_name(inst['Instruction'])
            if not instruction_name:
                print(f"Unable to get instruction from: {inst['Instruction']}")
            else:
                names.add(instruction_name)
        # else, skip the line
    if not names:
        if filename in UNPARSEABLE_INSTR_NAMES:
            for inst in filename.split(":"):
                names.add(inst)
        else:
            print(f"{filename}: Failed to read instruction table")
            return None

    description_paragraphs = get_description_paragraphs(doc)

    for para in description_paragraphs:
        for link in para.find_all('a'):
            # this urljoin will only ensure relative urls are prefixed
            # if a url is already absolute it does nothing
            link['href'] = urllib.parse.urljoin('http://www.felixcloutier.com/x86/', link['href'])
            link['target'] = '_blank'
            link['rel'] = 'noreferrer noopener'

    return Instruction(
        filename,
        names,
        description_paragraphs[0].text.strip(),
        ''.join(map(lambda x: str(x), description_paragraphs)).strip())


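# Illustrative shape of what read_table() returns for a typical opcode table
# (column names vary between pages; this sample row is an assumption, not real data):
#   [{'Opcode/Instruction': '66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64', 'Description': '...'}, ...]
# i.e. one dict per table row, keyed by the header text with newlines removed.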
def read_table(start_table):
    # Tables on felixcloutier may be split in half, e.g. on https://www.felixcloutier.com/x86/sal:sar:shl:shr
    # This traverses the immediate siblings of the input table
    tables = []
    current_node = start_table
    while current_node:
        if current_node.name == 'table':
            tables.append(current_node)
        elif current_node.name is not None:  # whitespace between the tables, i.e. the \n, is a none tag
            break
        current_node = current_node.next_sibling
    # Finding all 'th' is not enough, since some headers are 'td'.
    # Instead, walk through all children of the first 'tr', filter out those
    # that are only whitespace, keep `get_text()` on the others.
    headers = list(
        map(lambda th: th.get_text(),
            filter(lambda th: str(th).strip(), tables[0].tr.children)))

    result = []
    if headers:
        # common case
        for table in tables:
            for row in table.find_all('tr'):
                obj = {}
                for column, name in zip(row.find_all('td'), headers):
                    # Remove '\n's in names that contain it.
                    obj[name.replace('\n', '')] = column.get_text()
                if obj:
                    result.append(obj)
    else:
        # Cases like BEXTR and BZHI
        for table in tables:
            rows = table.find_all('tr')
            if len(rows) != 1:
                return []
            obj = {}
            for td in rows[0].find_all('td'):
                header = td.p.strong.get_text()
                td.p.strong.decompose()
                obj[header] = td.get_text()
            result.append(obj)

    return result


def parse_html(directory):
    print("Parsing instructions...")
    instructions = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".html") and file != 'index.html':
                with open(os.path.join(root, file), encoding='utf-8') as f2:
                    name = os.path.splitext(file)[0]
                    if name in IGNORED_DUPLICATES or name in IGNORED_FILE_NAMES:
                        continue
                    try:
                        instruction = parse(name, f2)
                        if not instruction:
                            continue
                        patch_instruction(instruction)
                        instructions.append(instruction)
                    except Exception as e:
                        print(f"Error parsing {name}:\n{e}")
    return instructions


def self_test(instructions, directory):
    # For each generated instruction, check that there is a path to a file in
    # the documentation.
    directory = os.path.join(directory, "html")
    ok = True
    for inst in instructions:
        if not os.path.isfile(os.path.join(directory, inst.name + ".html")):
            print(f"Warning: {inst.name} has no associated file")
            ok = False
    return ok


def patch_instruction(instruction):
    if instruction.name == "ADDSS":
        print("\nPatching ADDSS")
        print("REMINDER: Check if https://github.com/compiler-explorer/compiler-explorer/issues/2380 is still relevant\n")

        old_body = instruction.body
        old_tooltip = instruction.tooltip
        instruction.body = old_body.replace("stores the double-precision", "stores the single-precision")
        instruction.tooltip = old_tooltip.replace("stores the double-precision", "stores the single-precision")


def main():
    args = parser.parse_args()
    print(f"Called with: {args}")
    # If we don't have the html folder already...
    if not os.path.isdir(os.path.join(args.inputfolder, 'html')):
        # We don't, try with the compressed file
        if not os.path.isfile(os.path.join(args.downloadfolder, "x86.tbz2")):
            # We can't find that either. Download it
            try:
                download_asm_doc_archive(args.downloadfolder)
                extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
            except IOError as e:
                print("Error when downloading archive:")
                print(e)
                sys.exit(1)
        else:
            # We have a file already downloaded
            extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
    instructions = parse_html(args.inputfolder)
    instructions.sort(key=lambda b: b.name)
    all_inst = set()
    for inst in instructions:
        if not all_inst.isdisjoint(inst.names):
            print(f"Overlap in instruction names: {inst.names.intersection(all_inst)} for {inst.name}")
        all_inst = all_inst.union(inst.names)
    if not self_test(instructions, args.inputfolder):
        print("Tests do not pass. Not writing output file. Aborting.")
        sys.exit(3)
    print(f"Writing {len(instructions)} instructions")
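    # For reference, each emitted case block roughly looks like this (illustrative
    # only; the JSON payload is elided, not copied from real output):
    #     case "PMOVZXBW":
    #         return {"html": ..., "tooltip": ..., "url": ...};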
    with open(args.outputpath, 'w') as f:
        f.write("""
export function getAsmOpcode(opcode) {
    if (!opcode) return;
    switch (opcode.toUpperCase()) {
""")
        for inst in instructions:
            for name in sorted(inst.names):
                f.write(f'        case "{name}":\n')
            f.write('            return {}'.format(json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": get_url_for_instruction(inst)
            }, indent=16, separators=(',', ': '), sort_keys=True))[:-1] + '            };\n\n')
        f.write("""
    }
}
""")


if __name__ == '__main__':
    main()