trumptracker.github.io/misc/scraper.py
2016-11-18 19:01:14 -05:00

42 lines
1.1 KiB
Python

# Imports
from lxml import html
import json
# Set tree variable
tree = None
# Open index.html file.
with open('./../_site/index.html') as f:
tree = html.document_fromstring(f.read())
# Grab all panels.
panels = tree.xpath('//div[@role="tabpanel"]')
output = {}
counter = 0
# For each panel, key is the id value.
for panel in panels:
output[panel.attrib['id']] = []
# Get all headers - high level ideas.
headers = panel.xpath('div/table/thead')
# Iterate through headers and add, prepare to add points.
for header in headers:
output[panel.attrib['id']].append({
"header": header.text_content().replace('\n', '').replace('\r', '').strip(),
"points": []
})
# Retrieve the next body element and append all points.
body = header.getnext()
output[panel.attrib['id']][len(output[panel.attrib['id']]) - 1]['points'].extend([x.text_content().replace('\n', '').replace('\r', '').strip() for x in body.xpath('tr/td') if len(x.getprevious().text_content()) > 0])
# Dump into a file.
with open('data.json', 'w') as f:
f.write(json.dumps(output))