Jeremy Overesch | 11e470a | 2021-03-12 07:40:49 -0600 | [diff] [blame] | 1 | #! /usr/bin/env python3 |
Rubén Rincón | 3d9c0a9 | 2017-03-28 14:45:38 +0200 | [diff] [blame] | 2 | # -*- coding: utf-8 -*- |
Rubén Rincón | 3d9c0a9 | 2017-03-28 14:45:38 +0200 | [diff] [blame] | 3 | import argparse |
Matt Godbolt | 983c6bd | 2017-04-08 06:45:02 -0500 | [diff] [blame] | 4 | import json |
Ethiraric | e4f3586 | 2018-01-26 09:54:42 +0100 | [diff] [blame] | 5 | import os |
| 6 | import re |
Ethiraric | 7cf8040 | 2018-01-31 20:26:11 +0100 | [diff] [blame] | 7 | import sys |
Ethiraric | 0e6f003 | 2018-03-06 09:43:48 +0100 | [diff] [blame] | 8 | import tarfile |
Ethiraric | e4f3586 | 2018-01-26 09:54:42 +0100 | [diff] [blame] | 9 | import urllib |
Jeremy Overesch | 11e470a | 2021-03-12 07:40:49 -0600 | [diff] [blame] | 10 | from urllib import request |
| 11 | from urllib import parse |
Matt Godbolt | 983c6bd | 2017-04-08 06:45:02 -0500 | [diff] [blame] | 12 | |
| 13 | try: |
| 14 | from bs4 import BeautifulSoup |
RabsRincon | b2203dd | 2018-01-24 10:12:31 +0100 | [diff] [blame] | 15 | except ImportError: |
Jeremy Overesch | 11e470a | 2021-03-12 07:40:49 -0600 | [diff] [blame] | 16 | raise ImportError("Please install BeautifulSoup (apt-get install python3-bs4 or pip install beautifulsoup4 should do it)") |
Rubén Rincón | 3d9c0a9 | 2017-03-28 14:45:38 +0200 | [diff] [blame] | 17 | |
# Command-line interface. All three options are optional; the input and
# download folders share the same default.
parser = argparse.ArgumentParser(
    description='Docenizes HTML version of the official Intel Asm PDFs')
parser.add_argument(
    '-i', '--inputfolder',
    type=str,
    default='asm-docs',
    help='Folder where the input files reside as .html. Default is ./asm-docs/')
parser.add_argument(
    '-o', '--outputpath',
    type=str,
    default='./asm-docs.js',
    help='Final path of the .js file. Default is ./asm-docs.js')
parser.add_argument(
    '-d', '--downloadfolder',
    type=str,
    default='asm-docs',
    help='Folder where the archive will be downloaded and extracted')
Rubén Rincón | 3d9c0a9 | 2017-03-28 14:45:38 +0200 | [diff] [blame] | 26 | |
# The maximum number of paragraphs from the description to copy.
MAX_DESC_PARAS = 5
# Matches the leading run of encoding tokens in a table cell: two-digit hex
# bytes, "m64", "NP", REX / VEX. / EVEX. prefixes, "/r"-style modifiers and
# lowercase words, each followed by optional whitespace.
STRIP_PREFIX = re.compile(r'^(([0-9a-fA-F]{2}|m64|NP|(REX|E?VEX\.)[.0-9A-Z]*|/[0-9a-z]+|[a-z]+)\b\s*)*')
# Matches a mnemonic at the start of the string: an uppercase letter followed
# by uppercase letters/digits, an optional '*', then whitespace or end of text.
INSTRUCTION_RE = re.compile(r'^([A-Z][A-Z0-9]+)\*?(\s+|$)')
# Some instructions are so broken we just take their names from the filename
UNPARSEABLE_INSTR_NAMES = ['PSRLW:PSRLD:PSRLQ', 'PSLLW:PSLLD:PSLLQ', 'MOVBE']
# Some files contain instructions which cannot be parsed and which compilers are unlikely to emit
IGNORED_FILE_NAMES = [
    # SGX pseudo-instructions
    "EADD", "EACCEPT", "EAUG", "EACCEPTCOPY",
    "EDECVIRTCHILD", "EINCVIRTCHILD", "EINIT", "ELDB:ELDU:ELDBC:ELBUC",
    "EMODPE", "EMODPR", "EMODT", "ERDINFO",
    "ESETCONTEXT", "ETRACKC", "EBLOCK", "ECREATE",
    "EDBGRD", "EDBGWR", "EENTER", "EEXIT",
    "EEXTEND", "EGETKEY", "ELDB", "ELDU",
    "ENCLS", "ENCLU", "EPA", "EREMOVE",
    "EREPORT", "ERESUME", "ETRACK", "EWB",
    # VMX instructions
    "INVEPT", "INVVPID", "VMCALL", "VMCLEAR",
    "VMFUNC", "VMLAUNCH", "VMLAUNCH:VMRESUME", "VMPTRLD",
    "VMPTRST", "VMREAD", "VMRESUME", "VMWRITE",
    "VMXOFF", "VMXON",
    # Other instructions
    "INVLPG", "LAHF", "RDMSR", "SGDT",
    # Unparsable instructions
    # These instructions should be supported in the future
    "MONITOR", "MOVDQ2Q", "MFENCE",
]
# Some instructions are defined in multiple files. We ignore a specific set of the
# duplicates here.
IGNORED_DUPLICATES = [
    'MOV-1',  # move to control reg
    'MOV-2',  # move to debug reg
    'CMPSD',  # compare doubleword (defined in CMPS:CMPSB:CMPSW:CMPSD:CMPSQ)
    'MOVQ',  # defined in MOVD:MOVQ
    'MOVSD',  # defined in MOVS:MOVSB:MOVSW:MOVSD:MOVSQ
    'VPBROADCASTB:VPBROADCASTW:VPBROADCASTD:VPBROADCASTQ',  # defined in VPBROADCAST
    "VGATHERDPS:VGATHERDPD",
    "VGATHERQPS:VGATHERQPD",
    "VPGATHERDD:VPGATHERQD",
    "VPGATHERDQ:VPGATHERQQ",
]
# Where to extract the asmdoc archive.
ASMDOC_DIR = "asm-docs"
# Remote location of the documentation archive, and the file name it is
# stored under inside the download folder.
ARCHIVE_URL = "http://www.felixcloutier.com/x86/x86.tbz2"
ARCHIVE_NAME = "x86.tbz2"
Matt Godbolt | 983c6bd | 2017-04-08 06:45:02 -0500 | [diff] [blame] | 112 | |
| 113 | |
class Instruction(object):
    """Documentation record for one instruction page.

    Attributes:
        name: identifier of the page (``parse`` passes the file name
            without its extension).
        names: collection of mnemonics documented by the page.
        tooltip: short description, with any trailing ':', ' ' and ','
            characters removed.
        body: longer description text (``parse`` passes HTML).
    """

    def __init__(self, name, names, tooltip, body):
        self.name, self.names, self.body = name, names, body
        # Tooltips taken from the docs may end in stray punctuation.
        self.tooltip = tooltip.rstrip(': ,')

    def __str__(self):
        return "{} = {}\n{}".format(self.name, self.tooltip, self.body)
Ethiraric | d612277 | 2018-01-26 16:51:39 +0100 | [diff] [blame] | 123 | |
RabsRincon | 09b7202 | 2018-03-10 11:28:22 +0100 | [diff] [blame] | 124 | |
def get_url_for_instruction(instr):
    """Return the felixcloutier.com page URL for *instr*.

    ``instr.name`` is percent-encoded before being placed in the URL.
    """
    page = urllib.parse.quote(instr.name)
    return "http://www.felixcloutier.com/x86/" + page + ".html"
Ethiraric | e4f3586 | 2018-01-26 09:54:42 +0100 | [diff] [blame] | 127 | |
Ethiraric | 7cf8040 | 2018-01-31 20:26:11 +0100 | [diff] [blame] | 128 | |
def download_asm_doc_archive(downloadfolder):
    """Download the documentation archive into *downloadfolder*.

    The folder is created when it does not exist. If the path exists but is
    not a directory, an error is printed and the process exits with status 1.
    The archive is fetched from ARCHIVE_URL and stored as ARCHIVE_NAME.
    """
    if os.path.exists(downloadfolder):
        if not os.path.isdir(downloadfolder):
            print(f"Error: download folder {downloadfolder} is not a directory")
            sys.exit(1)
    else:
        print(f"Creating {downloadfolder} as download folder")
        os.makedirs(downloadfolder)
    destination = os.path.join(downloadfolder, ARCHIVE_NAME)
    print("Downloading archive...")
    urllib.request.urlretrieve(ARCHIVE_URL, destination)
RabsRincon | 09b7202 | 2018-03-10 11:28:22 +0100 | [diff] [blame] | 139 | |
| 140 | |
def extract_asm_doc_archive(downloadfolder, inputfolder):
    """Unpack the downloaded archive into *inputfolder*.

    Any ``.html`` files already present under ``<inputfolder>/html`` are
    deleted first so stale pages do not survive a re-extraction.

    Args:
        downloadfolder: folder containing the archive named ARCHIVE_NAME.
        inputfolder: folder the archive is extracted into.
    """
    print("Extracting file...")
    html_dir = os.path.join(inputfolder, "html")
    if os.path.isdir(html_dir):
        for root, _dirs, files in os.walk(html_dir):
            for filename in files:
                if os.path.splitext(filename)[1] == ".html":
                    os.remove(os.path.join(root, filename))
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(os.path.join(downloadfolder, ARCHIVE_NAME)) as tar:
        # ARCHIVE_URL is plain HTTP, so the archive is not trusted: when the
        # running Python supports extraction filters, refuse members that
        # would be written outside *inputfolder*.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=inputfolder, filter="data")
        else:
            tar.extractall(path=inputfolder)
Matt Godbolt | 983c6bd | 2017-04-08 06:45:02 -0500 | [diff] [blame] | 150 | |
| 151 | |
def strip_non_instr(i):
    """Drop the leading encoding tokens matched by STRIP_PREFIX from *i*.

    Handles encodings where the opcode sits in the middle of prefix stuff,
    e.g. ``66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64``.
    """
    return re.sub(STRIP_PREFIX, '', i)
| 157 | |
| 158 | |
def instr_name(i):
    """Return the mnemonic found at the start of *i*, or None.

    Leading encoding tokens are removed with ``strip_non_instr`` before
    INSTRUCTION_RE is applied.
    """
    found = INSTRUCTION_RE.match(strip_non_instr(i))
    return found.group(1) if found else None
| 163 | |
| 164 | |
def get_description_paragraphs(document_soup, max_paragraphs=None):
    """Collect the leading ``<p>`` nodes that follow the "description" element.

    Starting two siblings after the element with ``id="description"``, nodes
    are visited two siblings at a time (the node in between is expected to be
    the line feed separating elements). Collection stops when the limit is
    reached, when a visited node's text is 20 characters or shorter, or when
    the sibling chain ends.

    Args:
        document_soup: parsed document offering ``find(id=...)``.
        max_paragraphs: maximum number of paragraphs to return; defaults to
            MAX_DESC_PARAS when None.

    Returns:
        A list of paragraph nodes. It is empty when the document has no
        "description" element or nothing qualifying follows it.
    """
    limit = MAX_DESC_PARAS if max_paragraphs is None else max_paragraphs
    header = document_soup.find(id="description")
    if header is None:
        return []
    paragraphs = []
    node = _two_siblings_on(header)
    # "node is not None" guards against running off the end of the sibling
    # chain, e.g. when the description is the last section of its parent.
    while node is not None and len(paragraphs) < limit and len(node.text) > 20:
        if node.name == "p":
            paragraphs.append(node)
        # Move two siblings forward. Next sibling is the line feed.
        node = _two_siblings_on(node)
    return paragraphs


def _two_siblings_on(node):
    """Return the sibling two positions after *node*, or None if there is none."""
    following = node.next_sibling
    return None if following is None else following.next_sibling
Matt Godbolt | 983c6bd | 2017-04-08 06:45:02 -0500 | [diff] [blame] | 177 | |
RabsRincon | b2203dd | 2018-01-24 10:12:31 +0100 | [diff] [blame] | 178 | |
def parse(filename, f):
    """Build an Instruction from one documentation page.

    Args:
        filename: page name without extension; used as the Instruction name
            and, for pages listed in UNPARSEABLE_INSTR_NAMES, split on ':'
            to obtain the mnemonics.
        f: HTML markup (or open file) handed to BeautifulSoup.

    Returns:
        An Instruction, or None (after printing a message) when the page has
        no table, yields no mnemonics, or has no usable description.
    """
    # Headers under which a cell holds one "opcode + mnemonic" entry per line.
    opcode_columns = (
        'Opcode/Instruction',
        'OpcodeInstruction',
        'Opcode Instruction',
        'Opcode*/Instruction',
        'Opcode / Instruction',
    )
    doc = BeautifulSoup(f, 'html.parser')
    if doc.table is None:
        print(f"{filename}: Failed to find table")
        return None

    names = set()
    for row in read_table(doc.table):
        opcode_key = next((key for key in opcode_columns if key in row), None)
        if opcode_key is not None:
            for line in row[opcode_key].split("\n"):
                mnemonic = instr_name(line)
                if mnemonic:
                    names.add(mnemonic)
        elif 'Instruction' in row:
            mnemonic = instr_name(row['Instruction'])
            if mnemonic:
                names.add(mnemonic)
            else:
                print(f"Unable to get instruction from: {row['Instruction']}")
        # else, skip the line
    if not names:
        if filename in UNPARSEABLE_INSTR_NAMES:
            names.update(filename.split(":"))
        else:
            print(f"{filename}: Failed to read instruction table")
            return None

    description_paragraphs = get_description_paragraphs(doc)
    if not description_paragraphs:
        # Without this guard the [0] lookup below raises IndexError.
        print(f"{filename}: Failed to find description")
        return None

    for para in description_paragraphs:
        for link in para.find_all('a'):
            href = link.get('href')
            if href is not None:
                # this urljoin will only ensure relative urls are prefixed
                # if a url is already absolute it does nothing
                # (urllib.parse is spelled out because this function's own
                # name rebinds the module-level `parse` import.)
                link['href'] = urllib.parse.urljoin('http://www.felixcloutier.com/x86/', href)
            link['target'] = '_blank'
            link['rel'] = 'noreferrer noopener'

    return Instruction(
        filename,
        names,
        description_paragraphs[0].text.strip(),
        ''.join(map(str, description_paragraphs)).strip())
Matt Godbolt | 983c6bd | 2017-04-08 06:45:02 -0500 | [diff] [blame] | 234 | |
| 235 | |
def read_table(start_table):
    """Read an instruction table into a list of ``{header: cell text}`` dicts.

    Tables on felixcloutier may be split in half, e.g. on
    https://www.felixcloutier.com/x86/sal:sar:shl:shr, so the siblings that
    immediately follow *start_table* are scanned and every consecutive table
    is read as part of the same logical table.
    """
    tables = []
    node = start_table
    while node:
        tag = node.name
        if tag == 'table':
            tables.append(node)
        elif tag is not None:  # whitespace between the tables, i.e. the \n, is a none tag
            break
        node = node.next_sibling

    # Finding all 'th' is not enough, since some headers are 'td'.
    # Instead, walk through all children of the first 'tr', drop those that
    # are only whitespace and take `get_text()` of the rest.
    headers = [cell.get_text() for cell in tables[0].tr.children if str(cell).strip()]

    result = []
    if headers:
        # common case
        # Remove '\n's in names that contain it.
        keys = [header.replace('\n', '') for header in headers]
        for table in tables:
            for row in table.find_all('tr'):
                record = {key: cell.get_text() for cell, key in zip(row.find_all('td'), keys)}
                if record:
                    result.append(record)
    else:
        # Cases like BEXTR and BZHI
        for table in tables:
            rows = table.find_all('tr')
            if len(rows) != 1:
                return []
            record = {}
            for td in rows[0].find_all('td'):
                # The <strong> inside the cell's paragraph is the header; it
                # is removed so the remaining cell text is the value.
                label = td.p.strong
                header = label.get_text()
                label.decompose()
                record[header] = td.get_text()
            result.append(record)

    return result
| 279 | |
| 280 | |
def parse_html(directory):
    """Parse every instruction page found under *directory*.

    All ``.html`` files except ``index.html`` are visited. Pages named in
    IGNORED_DUPLICATES or IGNORED_FILE_NAMES are skipped, as are pages for
    which ``parse`` returns nothing. An exception raised while handling one
    page is printed and does not stop the walk.

    Returns:
        The list of parsed (and patched) instructions.
    """
    print("Parsing instructions...")
    instructions = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file == 'index.html' or not file.endswith(".html"):
                continue
            name = os.path.splitext(file)[0]
            with open(os.path.join(root, file), encoding='utf-8') as f2:
                if name in IGNORED_DUPLICATES or name in IGNORED_FILE_NAMES:
                    continue
                try:
                    instruction = parse(name, f2)
                    if instruction:
                        patch_instruction(instruction)
                        instructions.append(instruction)
                except Exception as e:
                    print(f"Error parsing {name}:\n{e}")
    return instructions
| 300 | |
| 301 | |
def self_test(instructions, directory):
    """Check that every instruction has a page under ``<directory>/html``.

    For each instruction whose ``<name>.html`` file is missing a warning is
    printed.

    Returns:
        True when every instruction has its file, False otherwise.
    """
    html_dir = os.path.join(directory, "html")
    missing = [
        inst.name
        for inst in instructions
        if not os.path.isfile(os.path.join(html_dir, inst.name + ".html"))
    ]
    for name in missing:
        print(f"Warning: {name} has not file associated")
    return not missing
Ethiraric | ee75589 | 2018-01-26 13:45:14 +0100 | [diff] [blame] | 312 | |
Ethiraric | a2a2d99 | 2018-03-09 20:53:52 +0100 | [diff] [blame] | 313 | |
def patch_instruction(instruction):
    """Apply a manual correction to the ADDSS instruction, in place.

    For ADDSS, "stores the double-precision" is replaced with
    "stores the single-precision" in both the body and the tooltip, and a
    reminder about the related issue is printed. Other instructions are left
    untouched.
    """
    if instruction.name != "ADDSS":
        return
    print("\nPatching ADDSS")
    print("REMINDER: Check if https://github.com/compiler-explorer/compiler-explorer/issues/2380 is still relevant\n")

    wrong, right = "stores the double-precision", "stores the single-precision"
    instruction.body = instruction.body.replace(wrong, right)
    instruction.tooltip = instruction.tooltip.replace(wrong, right)
| 323 | |
| 324 | |
def main():
    """Generate the JavaScript opcode documentation file.

    Steps: parse the command line; if ``<inputfolder>/html`` is missing,
    extract the archive (downloading it first when it is not in the download
    folder); parse all pages; report mnemonics claimed by more than one page;
    run ``self_test``; write a ``getAsmOpcode`` switch to the output path.

    Exits with status 1 when an IOError occurs while downloading/extracting a
    freshly downloaded archive, and with status 3 when ``self_test`` fails.
    """
    args = parser.parse_args()
    print(f"Called with: {args}")
    # If we don't have the html folder already...
    if not os.path.isdir(os.path.join(args.inputfolder, 'html')):
        # We don't, try with the compressed file
        if not os.path.isfile(os.path.join(args.downloadfolder, ARCHIVE_NAME)):
            # We can't find that either. Download it
            try:
                download_asm_doc_archive(args.downloadfolder)
                extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
            except IOError as e:
                print("Error when downloading archive:")
                print(e)
                sys.exit(1)
        else:
            # We have a file already downloaded
            extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
    instructions = parse_html(args.inputfolder)
    instructions.sort(key=lambda b: b.name)
    all_inst = set()
    for inst in instructions:
        if not all_inst.isdisjoint(inst.names):
            print(f"Overlap in instruction names: {inst.names.intersection(all_inst)} for {inst.name}")
        all_inst.update(inst.names)
    # self_test is run exactly once, so each warning is printed a single time.
    if not self_test(instructions, args.inputfolder):
        print("Tests do not pass. Not writing output file. Aborting.")
        sys.exit(3)
    print(f"Writing {len(instructions)} instructions")
    # NOTE(review): the leading whitespace inside the literals written below
    # was reconstructed to line up with the indent=16 JSON body; it only
    # affects the generated file's indentation. Confirm against the
    # checked-in asm-docs.js.
    with open(args.outputpath, 'w', encoding='utf-8') as f:
        f.write("""
export function getAsmOpcode(opcode) {
    if (!opcode) return;
    switch (opcode.toUpperCase()) {
""")
        for inst in instructions:
            # Every mnemonic of a page falls through to one shared return.
            for name in sorted(inst.names):
                f.write(f'        case "{name}":\n')
            entry = json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": get_url_for_instruction(inst)
            }, indent=16, separators=(',', ': '), sort_keys=True)
            # Drop json's closing brace so it can be re-emitted at the
            # indentation of the return statement, followed by ';'.
            f.write('            return ' + entry[:-1] + '            };\n\n')
        f.write("""
    }
}
""")


if __name__ == '__main__':
    main()