Blame - etc/scripts/docenizer.py - compiler-explorer - Rivoreo Source Code Repositories

blob: 9f74d3f4605a9aa652f139dffa65d78cf3da85ec [file] [log] [blame] [raw]

# -*- coding: utf-8 -*-
				2	import os
				3	import argparse
Matt Godbolt	983c6bd	2017-04-08 06:45:02 -0500	[diff] [blame]	4	import re
				5	import json
				6
				7	try:
				8	from bs4 import BeautifulSoup
				9	except:
				10	raise "Please install BeautifulSoup (apt-get install python-bs4 should do it)"
Rubén Rincón	3d9c0a9	2017-03-28 14:45:38 +0200	[diff] [blame]	11
				12	parser = argparse.ArgumentParser(description='Docenizes HTML version of the official Intel Asm PDFs')
Matt Godbolt	983c6bd	2017-04-08 06:45:02 -0500	[diff] [blame]	13	parser.add_argument('-i', '--inputfolder', type=str,
				14	help='Folder where the input files reside as .html. Default is current folder', default='./')
				15	parser.add_argument('-o', '--outputpath', type=str, help='Final path of the .js file. Default is ./asm-docs.js',
				16	default='./asm-docs.js')
Rubén Rincón	3d9c0a9	2017-03-28 14:45:38 +0200	[diff] [blame]	17
# The maximum number of paragraphs from the description to copy.
MAX_DESC_PARAS = 5
# Matches the run of encoding tokens that can precede a mnemonic on an
# opcode/instruction line: two-digit hex bytes, REX/VEX. prefixes (which may
# carry several dot-separated fields, hence the `*`), /digit or /r modifiers,
# and lowercase operand-size markers. The alternatives must be separated by an
# unescaped `|`; an escaped `\|` would only match a literal pipe character.
STRIP_PREFIX = re.compile(r'^(([0-9a-fA-F]{2}|(REX|VEX\.)[.0-9A-Z]*|/.|[a-z]+)\b\s*)*')
# An upper-case mnemonic, optionally followed by `*`, that is terminated by
# whitespace or by the end of the string.
INSTRUCTION_RE = re.compile(r'^([A-Z][A-Z0-9]+)\*?(\s+|$)')
# Some instructions are so broken we just take their names from the filename
UNPARSEABLE_INSTR_NAMES = ['PSRLW:PSRLD:PSRLQ', 'PSLLW:PSLLD:PSLLQ']
# Some instructions are defined in multiple files. We ignore a specific set of the
# duplicates here.
IGNORED_DUPLICATES = [
    'MOV-1',  # move to control reg
    'MOV-2',  # move to debug reg
    'CMPSD',  # compare doubleword (defined in CMPS:CMPSB:CMPSW:CMPSD:CMPSQ)
    'MOVQ',  # defined in MOVD:MOVQ
    'MOVSD'  # defined in MOVS:MOVSB:MOVSW:MOVSD:MOVSQ
]
				33
				34
				35	class Instruction(object):
				36	def __init__(self, name, names, tooltip, body):
				37	self.name = name
				38	self.names = names
Rubén	895b66d	2017-10-06 20:13:04 +0200	[diff] [blame]	39	self.tooltip = tooltip.rstrip(': ,')
Matt Godbolt	983c6bd	2017-04-08 06:45:02 -0500	[diff] [blame]	40	self.body = body
				41
				42	def __str__(self):
				43	return "{} = {}\n{}".format(self.names, self.tooltip, self.body)
				44
				45
				46	def strip_non_instr(i):
				47	# removes junk from encodings where the opcode is in the middle
				48	# of prefix stuff. e.g.
				49	# 66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64
				50	return STRIP_PREFIX.sub('', i)
				51
				52
				53	def instr_name(i):
				54	match = INSTRUCTION_RE.match(strip_non_instr(i))
				55	if match:
				56	return match.group(1)
				57
				58
				59	def get_description(section):
				60	for sub in section:
				61	descr = sub.get_text().strip()
				62	if len(descr) > 20:
				63	return descr
				64	raise RuntimeError("Couldn't find decent description in {}".format(section))
				65
				66	def parse(name, f):
				67	doc = BeautifulSoup(f, 'html.parser')
				68	table = read_table(doc.table)
				69	names = set()
				70
				71	def add_all(instrs):
				72	for i in instrs:
				73	name = instr_name(i)
				74	if name: names.add(name)
				75
				76	for inst in table:
				77	if 'Opcode/Instruction' in inst:
				78	add_all(inst['Opcode/Instruction'].split("\n"))
				79	elif 'Opcode*/Instruction' in inst:
				80	add_all(inst['Opcode*/Instruction'].split("\n"))
				81	else:
				82	name = instr_name(inst['Instruction'])
				83	if not name:
				84	print "Unable to get instruction from:", inst['Instruction']
				85	else:
				86	names.add(name)
				87	if not names:
				88	if name in UNPARSEABLE_INSTR_NAMES:
				89	for inst in name.split(":"):
				90	names.add(inst)
				91	else:
				92	return None
				93	sections = {}
				94	for section_header in doc.find_all("h2"):
				95	children = []
				96	first = section_header.next_sibling
				97	while first and first.name != 'h2':
				98	if str(first).strip():
				99	children.append(first)
				100	first = first.next_sibling
				101	sections[section_header.text] = children
				102	return Instruction(
				103	name,
				104	names,
				105	get_description(sections['Description']),
				106	"".join(str(x) for x in sections['Description'][:MAX_DESC_PARAS]).strip())
				107
				108
				109	def read_table(table):
				110	headers = [h.get_text() for h in table.find_all('th')]
				111	result = []
				112	if headers:
				113	# common case
				114	for row in table.find_all('tr'):
				115	obj = {}
				116	for column, name in zip(row.find_all('td'), headers):
				117	obj[name] = column.get_text()
				118	if obj:
				119	result.append(obj)
				120	else:
				121	# Cases like BEXTR and BZHI
				122	rows = table.find_all('tr')
				123	if len(rows) != 1:
				124	return []
				125	obj = {}
				126	for td in rows[0].find_all('td'):
				127	header = td.p.strong.get_text()
				128	td.p.strong.decompose()
				129	obj[header] = td.get_text()
				130	result.append(obj)
				131
				132	return result
				133
				134
				135	def parse_html(directory):
				136	instructions = []
				137	for root, dirs, files in os.walk(directory):
Rubén Rincón	dc35dec	2017-03-28 16:10:09 +0200	[diff] [blame]	138	for file in files:
Matt Godbolt	983c6bd	2017-04-08 06:45:02 -0500	[diff] [blame]	139	if file.endswith(".html") and file != 'index.html':
Rubén Rincón	dc35dec	2017-03-28 16:10:09 +0200	[diff] [blame]	140	with open(os.path.join(root, file)) as f2:
Matt Godbolt	983c6bd	2017-04-08 06:45:02 -0500	[diff] [blame]	141	name = os.path.splitext(file)[0]
				142	if name in IGNORED_DUPLICATES:
				143	continue
				144	instruction = parse(name, f2)
				145	if not instruction:
				146	print "Unable to get instructions for " + file
				147	continue
				148	instructions.append(instruction)
				149	return instructions
				150
				151
				152	if __name__ == '__main__':
				153	args = parser.parse_args()
				154	instructions = parse_html(args.inputfolder);
				155	instructions.sort(lambda x, y: cmp(x.name, y.name))
				156	all_inst = set()
				157	for inst in instructions:
				158	if not all_inst.isdisjoint(inst.names):
				159	print "Overlap in instruction names: {} for {}".format(
				160	inst.names.intersection(all_inst), inst.name)
				161	all_inst = all_inst.union(inst.names)
				162
				163	with open(args.outputpath, 'w') as f:
				164	f.write("""
				165	function getAsmOpcode(opcode) {
				166	if (!opcode) return;
				167	switch (opcode.toUpperCase()) {
				168	""")
				169	for inst in instructions:
				170	for name in inst.names:
				171	f.write(' case "{}":\n'.format(name))
Matt Godbolt	d0391b2	2017-04-08 11:14:33 -0500	[diff] [blame]	172	f.write(' return {};\n\n'.format(json.dumps({
Matt Godbolt	983c6bd	2017-04-08 06:45:02 -0500	[diff] [blame]	173	"tooltip": inst.tooltip,
				174	"html": inst.body,
				175	"url": "http://www.felixcloutier.com/x86/{}.html".format(inst.name)
				176	})))
				177	f.write("""
				178	}
				179	}
				180
				181	module.exports = {
				182	getAsmOpcode: getAsmOpcode
				183	};
				184	""")