blob: 4f1d912e75b17bdaab65105743f0b9761067fb2c [file] [log] [blame] [raw]
#! /usr/bin/env python2
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import shutil
import sys
import urllib
import zipfile
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError("Please install BeautifulSoup (apt-get install python-bs4 should do it)")
# Command-line interface: where the HTML pages are read from, where the
# generated JavaScript is written, and where the archive is downloaded to.
parser = argparse.ArgumentParser(
    description='Docenizes HTML version of the official Intel Asm PDFs')
parser.add_argument(
    '-i', '--inputfolder',
    type=str,
    default='asm-docs',
    help='Folder where the input files reside as .html. Default is ./asm-docs/')
parser.add_argument(
    '-o', '--outputpath',
    type=str,
    default='./asm-docs.js',
    help='Final path of the .js file. Default is ./asm-docs.js')
parser.add_argument(
    '-d', '--downloadfolder',
    type=str,
    default='asm-docs',
    help='Folder where the archive will be downloaded and extracted')
# Upper bound on how many description paragraphs are copied into the output.
MAX_DESC_PARAS = 5

# Matches the leading run of encoding tokens in an opcode/instruction cell:
# two-digit hex bytes, REX... / VEX.... prefixes, "/x" modifiers and lowercase
# words, each ending on a word boundary and followed by optional whitespace.
STRIP_PREFIX = re.compile(r'^(([0-9a-fA-F]{2}|(REX|VEX\.)[.0-9A-Z]*|/.|[a-z]+)\b\s*)*')

# Matches a mnemonic at the start of the text: an uppercase letter followed by
# at least one uppercase letter or digit, an optional '*', then whitespace or
# end of string. Group 1 is the mnemonic.
INSTRUCTION_RE = re.compile(r'^([A-Z][A-Z0-9]+)\*?(\s+|$)')

# Pages whose tables are too broken to parse; their instruction names are
# taken from the (colon-separated) file name instead.
UNPARSEABLE_INSTR_NAMES = ['PSRLW:PSRLD:PSRLQ', 'PSLLW:PSLLD:PSLLQ']

# Some instructions are documented in more than one file. The page names
# listed here are the duplicates that get skipped.
IGNORED_DUPLICATES = [
    'MOV-1',  # move to control reg
    'MOV-2',  # move to debug reg
    'CMPSD',  # compare doubleword (defined in CMPS:CMPSB:CMPSW:CMPSD:CMPSQ)
    'MOVQ',   # defined in MOVD:MOVQ
    'MOVSD'   # defined in MOVS:MOVSB:MOVSW:MOVSD:MOVSQ
]

# Directory into which the asmdoc archive is extracted.
ASMDOC_DIR = "asm-docs"
class Instruction(object):
    """Documentation extracted for one instruction page.

    Attributes:
        name: Value passed as ``name`` (stored unchanged).
        names: Value passed as ``names`` (stored unchanged).
        tooltip: The ``tooltip`` argument with trailing ':', ' ' and ','
            characters removed.
        body: Value passed as ``body`` (stored unchanged).
    """

    def __init__(self, name, names, tooltip, body):
        """Store the fields, trimming trailing punctuation off the tooltip."""
        cleaned_tooltip = tooltip.rstrip(': ,')
        self.name, self.names = name, names
        self.tooltip, self.body = cleaned_tooltip, body

    def __str__(self):
        """Return ``"<names> = <tooltip>"`` followed by the body on a new line."""
        summary = "{} = {}".format(self.names, self.tooltip)
        return "{}\n{}".format(summary, self.body)
def get_url_for_instruction(instr):
    """Return the felixcloutier.com reference URL for an instruction.

    Args:
        instr: Object whose ``name`` attribute is the documentation page name.

    Returns:
        The page URL, with the page name percent-encoded.
    """
    try:
        quote = urllib.quote  # Python 2, which this script targets
    except AttributeError:
        from urllib.parse import quote  # same function on Python 3
    # Build the URL from the instruction that was passed in, never from a
    # module-level ``name`` variable that happens to be in scope.
    return "http://www.felixcloutier.com/x86/{}.html".format(quote(instr.name))
def download_asm_doc_archive(downloadfolder):
    """Download the x86 documentation archive and extract it.

    The archive is saved as ``x86.zip`` inside ``downloadfolder`` and
    extracted there. ``.html`` files left under ``<downloadfolder>/html`` by
    an earlier run are deleted first, and the ``__MACOSX`` folder is removed
    after extraction if the archive produced one.

    Args:
        downloadfolder: Target directory; created if it does not exist.

    Raises:
        SystemExit: With status 1 if ``downloadfolder`` exists but is not a
            directory.
    """
    if not os.path.exists(downloadfolder):
        os.makedirs(downloadfolder)
    elif not os.path.isdir(downloadfolder):
        print("Error: download folder {} is not a directory".format(downloadfolder))
        sys.exit(1)

    archive_name = os.path.join(downloadfolder, "x86.zip")
    print("Downloading archive...")
    urllib.urlretrieve("http://www.felixcloutier.com/x86/x86.zip", archive_name)

    # Remove pages from a previous extraction so stale files do not linger.
    html_dir = os.path.join(downloadfolder, "html")
    if os.path.isdir(html_dir):
        for root, _dirs, files in os.walk(html_dir):
            for filename in files:
                if os.path.splitext(filename)[1] == ".html":
                    os.remove(os.path.join(root, filename))

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(archive_name, 'r') as zip_ref:
        zip_ref.extractall(downloadfolder)

    # Only present when the archive was built on macOS; nothing to do otherwise.
    macosx_dir = os.path.join(downloadfolder, "__MACOSX")
    if os.path.isdir(macosx_dir):
        shutil.rmtree(macosx_dir)
def strip_non_instr(i):
    """Drop the leading encoding tokens matched by ``STRIP_PREFIX``.

    Some encodings put the opcode in the middle of prefix material, e.g.
    ``66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64``; whatever ``STRIP_PREFIX``
    matches at the start of ``i`` is removed and the rest is returned.
    """
    without_prefix = re.sub(STRIP_PREFIX, '', i)
    return without_prefix
def instr_name(i):
    """Return the mnemonic found at the start of ``i``, or None.

    ``i`` is first passed through ``strip_non_instr``; the result is matched
    against ``INSTRUCTION_RE`` and its first group is returned on success.
    """
    found = INSTRUCTION_RE.match(strip_non_instr(i))
    return found.group(1) if found else None
def get_section_description_pars(section):
    """Return the ``get_text()`` result of every item in ``section``, in order."""
    return [node.get_text() for node in section]
def get_strong_description_pars(strong):
    """Collect text from ``strong``'s parent and the siblings that follow it.

    Starting at ``strong.parent`` and following ``next_sibling``, the
    ``get_text()`` of every node that has a ``p`` attribute is collected.
    The walk ends at the first node without a ``next_sibling`` attribute.

    Returns:
        The collected texts, in document order.
    """
    paragraphs = []
    node = strong.parent
    while hasattr(node, 'next_sibling'):
        if hasattr(node, 'p'):
            paragraphs.append(node.get_text())
        node = node.next_sibling
    # The node that ended the walk is still inspected, as every other node was.
    if hasattr(node, 'p'):
        paragraphs.append(node.get_text())
    return paragraphs
def get_description_from_pars(pars):
    """Return the first paragraph that is long enough to serve as a summary.

    Args:
        pars: Iterable of paragraph strings.

    Returns:
        The first paragraph whose whitespace-stripped text is longer than
        20 characters, stripped.

    Raises:
        RuntimeError: If no paragraph qualifies.
    """
    for par in pars:
        descr = par.strip()
        if len(descr) > 20:
            return descr
    # Report the paragraphs that were actually inspected.
    raise RuntimeError("Couldn't find decent description in {}".format(pars))
def _collect_sections(doc, heading_tag):
    """Map each ``heading_tag`` heading's text to the nodes that follow it.

    For every ``heading_tag`` element in ``doc``, the following siblings are
    gathered until the next element named ``heading_tag`` (or the end of the
    sibling chain). Nodes whose ``str()`` is only whitespace are skipped.
    """
    sections = {}
    for header in doc.find_all(heading_tag):
        children = []
        node = header.next_sibling
        # Compare against None explicitly so a falsy (empty) text node does
        # not end the walk early.
        while node is not None and node.name != heading_tag:
            if str(node).strip():
                children.append(node)
            node = node.next_sibling
        sections[header.text] = children
    return sections


def parse(filename, f):
    """Parse one instruction page into an ``Instruction``.

    Args:
        filename: Page name without extension; stored as the instruction's
            ``name`` and, for pages in ``UNPARSEABLE_INSTR_NAMES``, split on
            ':' to obtain the instruction names.
        f: Open file (or markup) handed to BeautifulSoup.

    Returns:
        An ``Instruction``, or None when no instruction name could be found
        and the page is not listed in ``UNPARSEABLE_INSTR_NAMES``.

    Raises:
        KeyError: If no 'Description' section can be located.
        RuntimeError: Propagated from ``get_description_from_pars`` when no
            paragraph is long enough.
    """
    print("============ " + filename)
    doc = BeautifulSoup(f, 'html.parser')
    table = read_table(doc.table)
    names = set()

    def add_all(instrs):
        for i in instrs:
            instruction_name = instr_name(i)
            if instruction_name:
                names.add(instruction_name)

    # Header spellings under which opcode and instruction share one column,
    # one encoding per line. The first one present in a row is used.
    combined_columns = ('Opcode/Instruction', 'OpcodeInstruction', 'Opcode*/Instruction')
    for inst in table:
        print(inst)
        for column in combined_columns:
            if column in inst:
                add_all(inst[column].split("\n"))
                break
        else:
            # Separate 'Instruction' column; a row without it yields no name
            # and is reported like any other unparseable row.
            instruction_text = inst.get('Instruction', u'')
            instruction_name = instr_name(instruction_text)
            if instruction_name:
                names.add(instruction_name)
            else:
                print(u"Unable to get instruction from: {}".format(instruction_text))

    if not names:
        if filename not in UNPARSEABLE_INSTR_NAMES:
            return None
        # Some pages are so broken that the names come from the file name.
        names.update(filename.split(":"))

    sections = _collect_sections(doc, 'h2')
    if 'Description' in sections:
        description = get_section_description_pars(sections['Description'])
    else:
        # No <h2> 'Description': it is either in a <strong> tag (MOV) or an
        # <h3> tag (VCVTPS2PH).
        description = None
        for strong in doc.find_all('strong'):
            if strong.get_text() == 'Description':
                description = get_strong_description_pars(strong)
                break
        if description is None:
            sections.update(_collect_sections(doc, 'h3'))
            description = get_section_description_pars(sections['Description'])

    return Instruction(
        filename,
        names,
        get_description_from_pars(description),
        "".join(description[:MAX_DESC_PARAS]).strip())
def read_table(table):
    """Convert an instruction table into a list of ``{header: text}`` dicts.

    Args:
        table: The page's table element (as returned by ``doc.table``), or
            None when the page has no table.

    Returns:
        One dict per data row, keyed by header text with newlines removed.
        An empty list when there is no table, the table has no rows, or the
        header-less layout does not consist of exactly one row.
    """
    if table is None or table.tr is None:
        return []

    # Finding all 'th' is not enough, since some headers are 'td'.
    # Instead, walk through all children of the first 'tr', skip those that
    # are only whitespace, and keep `get_text()` of the others.
    headers = [th.get_text() for th in table.tr.children if unicode(th).strip()]
    result = []
    if headers:
        # common case
        # Remove '\n's in names that contain it.
        keys = [name.replace('\n', '') for name in headers]
        for row in table.find_all('tr'):
            obj = {}
            for column, key in zip(row.find_all('td'), keys):
                obj[key] = column.get_text()
            if obj:
                result.append(obj)
    else:
        # Cases like BEXTR and BZHI
        rows = table.find_all('tr')
        if len(rows) != 1:
            return []
        obj = {}
        for td in rows[0].find_all('td'):
            # The header is the <strong> text inside the cell; removing it
            # leaves only the cell's value for get_text().
            header = td.p.strong.get_text()
            td.p.strong.decompose()
            obj[header] = td.get_text()
        result.append(obj)
    return result
def parse_html(directory):
    """Parse every instruction page found under ``directory``.

    Walks ``directory`` recursively and parses each ``.html`` file except
    ``index.html`` and the pages listed in ``IGNORED_DUPLICATES``. Pages for
    which ``parse`` returns nothing are reported and skipped.

    Returns:
        List of the parsed instructions, in directory-walk order.
    """
    instructions = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            if not filename.endswith(".html") or filename == 'index.html':
                continue
            name = os.path.splitext(filename)[0]
            # Decide whether to skip before opening the file at all.
            if name in IGNORED_DUPLICATES:
                continue
            with open(os.path.join(root, filename)) as f2:
                instruction = parse(name, f2)
            if not instruction:
                print("Unable to get instructions for " + filename)
                continue
            instructions.append(instruction)
    return instructions
def self_test(instructions, directory):
    """Warn about instructions that have no page under ``<directory>/html``.

    For each instruction, a warning is printed when
    ``<directory>/html/<name>.html`` is not an existing file.
    """
    html_dir = os.path.join(directory, "html")
    for inst in instructions:
        expected_page = os.path.join(html_dir, inst.name + ".html")
        if os.path.isfile(expected_page):
            continue
        print("Warning: {} has not file associated".format(inst.name))
# Script entry point: obtain the HTML pages (downloading them if the input
# folder is missing), parse them, and write the JavaScript lookup module.
if __name__ == '__main__':
    args = parser.parse_args()
    if not os.path.exists(args.inputfolder):
        try:
            download_asm_doc_archive(args.downloadfolder)
        except IOError as e:
            print("Error when downloading archive:")
            print(e)
            sys.exit(1)
        # Don't look into the input folder, but rather where we extracted.
        args.inputfolder = args.downloadfolder
    elif not os.path.isdir(args.inputfolder):
        print("Error: input folder {} is not a folder".format(args.inputfolder))
        sys.exit(1)

    instructions = parse_html(args.inputfolder)
    # Order by page name so the generated file does not depend on walk order.
    instructions.sort(key=lambda inst: inst.name)
    self_test(instructions, args.inputfolder)

    # Report instruction names that are claimed by more than one page.
    all_inst = set()
    for inst in instructions:
        overlap = inst.names.intersection(all_inst)
        if overlap:
            print("Overlap in instruction names: {} for {}".format(overlap, inst.name))
        all_inst.update(inst.names)

    with open(args.outputpath, 'w') as f:
        f.write("""
function getAsmOpcode(opcode) {
if (!opcode) return;
switch (opcode.toUpperCase()) {
""")
        for inst in instructions:
            # All names of a page fall through to one shared return value;
            # sorting keeps the emitted case labels in a reproducible order.
            for name in sorted(inst.names):
                f.write(' case "{}":\n'.format(name))
            payload = json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": get_url_for_instruction(inst)
            }, indent=16, separators=(',', ': '))
            # Drop the closing brace json.dumps emits and append our own,
            # followed by the statement terminator.
            f.write(' return {}'.format(payload)[:-1] + ' };\n\n')
        f.write("""
}
}
module.exports = {
getAsmOpcode: getAsmOpcode
};
""")