- Simplified Oracle Scraper for magiccards.info

This commit is contained in:
Sol
2013-09-19 03:18:34 +00:00
parent 24e320237f
commit 4e90eac764
2 changed files with 43 additions and 0 deletions

1
.gitattributes vendored
View File

@@ -15295,5 +15295,6 @@ tools/formats.txt -text
tools/guilds.xlsx -text tools/guilds.xlsx -text
tools/mtg-data.txt svneol=native#text/plain tools/mtg-data.txt svneol=native#text/plain
tools/mtgdata-sets-to-forge.txt svneol=native#text/plain tools/mtgdata-sets-to-forge.txt svneol=native#text/plain
tools/oracleScraper.py -text
tools/oracleScript.py svneol=native#text/x-python tools/oracleScript.py svneol=native#text/x-python
tools/packdata.xlsx -text tools/packdata.xlsx -text

42
tools/oracleScraper.py Normal file
View File

@@ -0,0 +1,42 @@
import io
import os

import requests
# Set code substituted into the magiccards.info spoiler query URL below.
setName = 'ths'
# HTML marker that precedes each card's name link; the spoiler page is split on it.
nameStart = '<span style="font-size: 1.2em;">'
# HTML markers that open and close the oracle text inside a card's chunk.
oracleStart = '<p class="ctext"><b>'
oracleEnd = '</b></p>'
def normalizeName(name):
    """Return the card-file base name for *name*.

    The name is lower-cased, commas and apostrophes are dropped, and spaces
    become underscores, e.g. "Hero's Downfall" -> "heros_downfall".
    """
    lowered = name.lower()
    stripped = lowered.replace(',', '').replace("'", "")
    return stripped.replace(' ', '_')
def normalizeOracle(oracle):
    """Return *oracle* with typographic punctuation replaced by ASCII.

    The em dash (U+2014) becomes '-', and both the left (U+2018) and right
    (U+2019) single quotation marks become a plain apostrophe.
    """
    replacements = (
        (u'\u2014', '-'),
        (u'\u2018', "'"),
        # The right single quote is the character typeset text uses for
        # apostrophes, so it needs the same treatment as the left one.
        (u'\u2019', "'"),
    )
    for fancy, plain in replacements:
        oracle = oracle.replace(fancy, plain)
    return oracle
# Download the spoiler listing for the configured set.  The timeout keeps the
# script from hanging on a dead connection, and an HTTP error aborts the run
# instead of being parsed as if it were a (card-less) spoiler page.
r = requests.get('http://magiccards.info/query?v=spoiler&s=issue&q=++e:%s/en' % setName,
                 timeout=30)
r.raise_for_status()

# Each card entry begins with nameStart; element 0 is all of the html that
# comes before our first card, so it is dropped.
spl = r.text.split(nameStart)[1:]
for s in spl:
    # Extract name and oracle from magiccards.info
    name = s[1 + s.find(">"):s.find("</a>")]
    norm = normalizeName(name)

    # Locate the oracle text explicitly so that a missing marker is reported
    # instead of silently producing a garbage slice from find() == -1.
    oracleAt = s.find(oracleStart)
    oracleStop = -1
    if oracleAt >= 0:
        oracleAt += len(oracleStart)
        oracleStop = s.find(oracleEnd, oracleAt)
    if not norm or oracleStop < 0:
        print('?  ' + norm)
        continue
    # Paragraph breaks are stored as a literal backslash-n in the card file.
    oracle = s[oracleAt:oracleStop].replace('<br><br>', '\\n')

    # Open relative cardsfolder (relative to the current working directory);
    # card files are grouped in sub-folders by the first letter of their name.
    path = os.path.join('..', 'res', 'cardsfolder', norm[0], norm + '.txt')
    hasOracle = False
    try:
        # io.open behaves the same on Python 2 and 3 and lets the unicode
        # text from the response be written without an implicit ASCII encode.
        # NOTE(review): assumes card files are UTF-8 (or plain ASCII) - confirm.
        with io.open(path, 'r', encoding='utf-8') as f:
            hasOracle = any(line.startswith(u'Oracle:') for line in f)
        if hasOracle:
            print('=  ' + norm)
            continue
        with io.open(path, 'a', encoding='utf-8') as f:
            # The line must carry the "Oracle:" prefix that the check above
            # looks for; otherwise every re-run would append the text again.
            # A single write avoids leaving a stray newline if encoding fails.
            f.write(u'\nOracle:' + normalizeOracle(oracle))
        print('+  ' + norm)
    except (IOError, OSError, UnicodeError):
        # Card file missing, unreadable or not decodable: report and move on.
        print('?  ' + norm)