blob: 65469816470fcb6dbad4fa2c4c7d0ddf9078f697 [file] [log] [blame] [raw]
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import sys
import tarfile
import urllib
from urllib import request
from urllib import parse
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError("Please install BeautifulSoup (apt-get install python3-bs4 or pip install beautifulsoup4 should do it)")
# Command-line interface: where the XML input lives, where the generated
# TypeScript goes, and where the downloaded archive is stored.
parser = argparse.ArgumentParser(
    description='Docenizes XML version of the official ARM documents',
)
parser.add_argument(
    '-i', '--inputfolder',
    default='asm-docs-arm',
    type=str,
    help='Folder where the input files reside as .xml. Default is ./asm-docs-arm/',
)
parser.add_argument(
    '-o', '--outputpath',
    default='./asm-docs-arm32.ts',
    type=str,
    help='Final path of the .ts file. Default is ./asm-docs-arm32.ts',
)
parser.add_argument(
    '-d', '--downloadfolder',
    default='asm-docs-arm',
    type=str,
    help='Folder where the archive will be downloaded and extracted',
)
# Upper bound on how many description paragraphs are copied per instruction.
MAX_DESC_PARAS = 5

# Removes an optional parenthesised qualifier and everything from "--" onwards
# in a section title, leaving only the comma-separated instruction names.
STRIP_SUFFIX = re.compile(r'\s*(\(.*\))?\s*--.*')

# Matches a leading upper-case mnemonic (optionally followed by "*") that is
# terminated by whitespace or end of string; group 1 is the mnemonic.
INSTRUCTION_RE = re.compile(r'^([A-Z][A-Z0-9]+)\*?(\s+|$)')

# Instructions too broken to parse; their names are meant to be taken from
# the filename instead.
UNPARSEABLE_INSTR_NAMES = []

# Files holding instructions that cannot be parsed and that compilers are
# unlikely to emit.
IGNORED_FILE_NAMES = []

# Instructions documented in more than one file: the duplicates listed here
# are skipped.
IGNORED_DUPLICATES = []

# Location of the asmdoc archive, the file name it is stored under locally,
# and the directory it unpacks into.
ARCHIVE_URL = "https://developer.arm.com/-/media/developer/products/architecture/armv8-a-architecture/2020-12/AArch32_ISA_xml_v87A-2020-12.tar.gz"
ARCHIVE_NAME = "AArch32_ISA_xml_v87A-2020-12.tar.gz"
ARCHIVE_SUBDIR = "ISA_AArch32_xml_v87A-2020-12"
class Instruction:
    """Documentation record built from one instruction XML file.

    Attributes:
        name: identifier of the record (the caller passes the source file's
            base name).
        names: collection of instruction names this record documents.
        tooltip: short description with trailing ':', ' ' and ',' removed.
        body: HTML fragment holding the longer description.
    """

    def __init__(self, name, names, tooltip, body):
        # Trim trailing punctuation/space so the tooltip ends cleanly.
        self.tooltip = tooltip.rstrip(': ,')
        self.body = body
        self.names = names
        self.name = name

    def __str__(self):
        """Render as "<names> = <tooltip>" followed by the body on a new line."""
        template = "{} = {}\n{}"
        return template.format(self.names, self.tooltip, self.body)
def get_url_for_instruction(instr):
    """Return the documentation URL to associate with *instr*.

    The argument is not inspected: every instruction currently maps to the
    same Base-Instructions index page.
    """
    base_instructions_index = "https://developer.arm.com/documentation/ddi0597/2020-12/Base-Instructions/"
    return base_instructions_index
def download_asm_doc_archive(downloadfolder):
    """Fetch the documentation archive from ARCHIVE_URL into *downloadfolder*.

    The folder is created when it does not exist. If the path exists but is
    not a directory, an error is printed and the process exits with status 1.
    The archive is saved as ARCHIVE_NAME inside the folder.
    """
    already_there = os.path.exists(downloadfolder)
    if already_there and not os.path.isdir(downloadfolder):
        print("Error: download folder {} is not a directory".format(downloadfolder))
        sys.exit(1)
    if not already_there:
        print("Creating {} as download folder".format(downloadfolder))
        os.makedirs(downloadfolder)

    destination = os.path.join(downloadfolder, ARCHIVE_NAME)
    print("Downloading archive...")
    urllib.request.urlretrieve(ARCHIVE_URL, destination)
def extract_asm_doc_archive(downloadfolder, inputfolder):
    """Unpack the downloaded archive into *inputfolder*.

    Args:
        downloadfolder: directory containing the archive named ARCHIVE_NAME.
        inputfolder: directory the archive is extracted into.

    Any ``.xml`` files already present under ``inputfolder/ARCHIVE_SUBDIR``
    are removed first, so files left over from an earlier extraction do not
    survive alongside the fresh ones.

    Raises:
        OSError, tarfile.TarError: propagated from the filesystem/tar layer.
    """
    print("Extracting file...")
    extracted_root = os.path.join(inputfolder, ARCHIVE_SUBDIR)
    if os.path.isdir(extracted_root):
        for root, _dirs, files in os.walk(extracted_root):
            for file in files:
                if os.path.splitext(file)[1] == ".xml":
                    os.remove(os.path.join(root, file))

    # The context manager guarantees the archive handle is closed, even when
    # extraction fails part-way through.
    with tarfile.open(os.path.join(downloadfolder, ARCHIVE_NAME)) as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: refuse members that would
            # escape the destination (absolute paths, "..", outside links).
            tar.extractall(path=inputfolder, filter="data")
        else:
            # Older Python without extraction filters; the archive is
            # extracted as-is, so it must come from a trusted source.
            tar.extractall(path=inputfolder)
def instr_name(i):
    """Return the mnemonic at the start of *i*, or None if there is none.

    The text is first passed through strip_non_instr and then matched
    against INSTRUCTION_RE; group 1 of the match is the mnemonic.
    """
    # NOTE(review): strip_non_instr is not defined in the visible part of this
    # file -- confirm it exists before relying on this helper.
    found = INSTRUCTION_RE.match(strip_non_instr(i))
    return found.group(1) if found else None
def get_authored_paragraphs(document_soup):
    """Collect the leading description paragraphs as ``<p>`` elements.

    Looks up ``instructionsection > desc > authored`` in *document_soup* and
    converts up to MAX_DESC_PARAS of its ``<para>`` elements into ``<p>``
    tags (modifying the soup in place).

    Args:
        document_soup: parsed document; its ``instructionsection`` element
            must exist (the caller checks this).

    Returns:
        A list of ``<p>`` tags (possibly empty), or None when the document
        has no ``<desc>`` or no ``<authored>`` element.
    """
    desc = document_soup.instructionsection.desc
    # A missing <desc> is treated like a missing <authored>: nothing to copy.
    authored = desc.authored if desc is not None else None
    if authored is None:
        return None

    authored_paragraphs = []
    for para in authored.find_all('para')[:MAX_DESC_PARAS]:
        # wrap() returns the new <p> now enclosing the <para>; unwrapping the
        # inner <para> leaves its children directly inside the <p>.
        wrapper = para.wrap(document_soup.new_tag('p'))
        wrapper.para.unwrap()
        authored_paragraphs.append(wrapper)
    return authored_paragraphs
def parse(filename, f):
    """Build an Instruction from one instruction XML document.

    NOTE: this definition rebinds the module-level name ``parse`` that the
    import section binds to ``urllib.parse``.

    Args:
        filename: identifier stored as the Instruction's ``name`` and used in
            diagnostics.
        f: markup (open file or string) handed to BeautifulSoup.

    Returns:
        An Instruction whose names come from the section title (suffix
        removed via STRIP_SUFFIX, split on commas), whose tooltip is the text
        of the first description paragraph and whose body is the HTML of all
        collected paragraphs. Returns None when the document has no
        ``instructionsection``, no ``title`` attribute, or no description
        paragraphs.
    """
    doc = BeautifulSoup(f, 'html.parser')
    instructionsection = doc.instructionsection
    if instructionsection is None:
        print(filename + ": Failed to find instructionsection")
        return None

    title = instructionsection.get('title')
    if title is None:
        # Without a title there are no instruction names to index by.
        print(filename + ": Failed to find instructionsection title")
        return None
    names = {name.strip() for name in STRIP_SUFFIX.sub('', title).split(',')}

    authored_paragraphs = get_authored_paragraphs(doc)
    # None means no description section at all; an empty list means the
    # section had no paragraphs. Either way there is nothing to show.
    if not authored_paragraphs:
        return None

    return Instruction(
        filename,
        names,
        authored_paragraphs[0].text.strip(),
        ''.join(str(paragraph) for paragraph in authored_paragraphs).strip())
def parse_xml(directory):
    """Parse every instruction XML file found under *directory*.

    Walks the tree, considering files that end in ".xml" except
    "onebigfile.xml". Files whose base name is listed in IGNORED_DUPLICATES
    or IGNORED_FILE_NAMES are skipped, as are files for which parse()
    yields nothing. Returns the list of parsed instructions.
    """
    print("Parsing instructions...")
    collected = []
    for root, _dirs, files in os.walk(directory):
        for entry in files:
            if not entry.endswith(".xml") or entry == "onebigfile.xml":
                continue
            stem = os.path.splitext(entry)[0]
            with open(os.path.join(root, entry), encoding='utf-8') as handle:
                if stem in IGNORED_DUPLICATES or stem in IGNORED_FILE_NAMES:
                    continue
                parsed = parse(stem, handle)
                if parsed:
                    collected.append(parsed)
    return collected
def self_test(instructions, directory):
    """Check that each instruction has a matching XML file on disk.

    For every instruction, looks for "<name>.xml" directly inside
    ``directory/ARCHIVE_SUBDIR`` and prints a warning for each one that is
    missing. Returns True only when no warning was printed.
    """
    docs_root = os.path.join(directory, ARCHIVE_SUBDIR)
    all_present = True
    for inst in instructions:
        expected_file = os.path.join(docs_root, inst.name + ".xml")
        if os.path.isfile(expected_file):
            continue
        print("Warning: {} has not file associated".format(inst.name))
        all_present = False
    return all_present
def _ensure_docs_extracted(args):
    """Make sure the extracted XML tree exists, downloading/extracting as needed."""
    if os.path.isdir(os.path.join(args.inputfolder, ARCHIVE_SUBDIR)):
        # The extracted folder is already there; nothing to do.
        return
    if os.path.isfile(os.path.join(args.downloadfolder, ARCHIVE_NAME)):
        # The archive was downloaded earlier; just unpack it.
        extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
        return
    # Neither the folder nor the archive exists: download, then unpack.
    try:
        download_asm_doc_archive(args.downloadfolder)
        extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
    except IOError as e:
        print("Error when downloading archive:")
        print(e)
        sys.exit(1)


def _write_typescript(outputpath, instructions):
    """Write the getAsmOpcode() TypeScript lookup for *instructions* to *outputpath*."""
    # Explicit encoding so the output does not depend on the platform locale.
    with open(outputpath, 'w', encoding='utf-8') as f:
        f.write("""
import {AssemblyInstructionInfo} from '../base.js';
export function getAsmOpcode(opcode: string | undefined): AssemblyInstructionInfo | undefined {
if (!opcode) return;
switch (opcode) {
""".lstrip())
        for inst in instructions:
            # All names of one instruction share a single return statement.
            for name in sorted(inst.names):
                f.write(' case "{}":\n'.format(name))
            payload = json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": get_url_for_instruction(inst)
            }, indent=16, separators=(',', ': '))
            # Drop the final '}' produced by json.dumps and re-emit it
            # followed by the statement terminator.
            f.write(' return {}'.format(payload)[:-1] + ' };\n\n')
        f.write("""
}
}
""")


def docenizer():
    """Generate the TypeScript instruction documentation file.

    Parses the command line, obtains the XML documentation (downloading and
    extracting the archive when necessary), parses every instruction, reports
    instruction names that appear in more than one file, and writes the
    output file.

    Exits with status 1 when the download fails with an IOError and with
    status 3 when the self test reports instructions without a source file.
    """
    args = parser.parse_args()
    print("Called with: {}".format(args))
    _ensure_docs_extracted(args)

    instructions = parse_xml(os.path.join(args.inputfolder, ARCHIVE_SUBDIR))
    instructions.sort(key=lambda b: b.name)

    # Report instruction names claimed by more than one file.
    all_inst = set()
    for inst in instructions:
        if not all_inst.isdisjoint(inst.names):
            print("Overlap in instruction names: {} for {}".format(
                inst.names.intersection(all_inst), inst.name))
        all_inst.update(inst.names)

    # Run the self test once; its warnings are printed by self_test itself.
    if not self_test(instructions, args.inputfolder):
        print("Tests do not pass. Not writing output file. Aborting.")
        sys.exit(3)

    print("Writing {} instructions".format(len(instructions)))
    _write_typescript(args.outputpath, instructions)


if __name__ == '__main__':
    docenizer()