Add a functional-test data scraper utility.
#!/usr/bin/env python
import re
from urllib2 import urlopen
from lkrtweb.lkrt.models import KernelVersion, ClientMachine, Status, TestType, FunctionalTest
def geturl(url):
    """Fetch ``url`` and return the entire response body.

    url -- address handed straight to ``urlopen``.

    Returns whatever ``read()`` yields for the opened response.  Any error
    raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    f = urlopen(url)
    try:
        return f.read()
    finally:
        # Close the response even when read() fails, so it is never leaked.
        f.close()
class StringMatcher:
    """Apply compiled regexes to one string while tracking a cursor.

    Every successful ``search`` or ``match`` moves ``cur_pos`` to the end of
    the matched text, so consecutive calls walk forward through the string.
    A failed attempt leaves the cursor where it was.
    """

    def __init__(self, str):
        # The parameter shadows the builtin ``str``; the name is kept so that
        # existing keyword-style callers continue to work.
        self.cur_pos = 0
        self.str = str

    def _advance(self, m):
        """Move the cursor past match ``m`` (when there is one); return ``m``."""
        if m is not None:
            # m.end() is the same position as m.start() + len(m.group(0)).
            self.cur_pos = m.end()
        return m

    def search(self, reobj):
        """Find the first occurrence of ``reobj`` at or after the cursor.

        Returns the match object, or None when the pattern is not found.
        """
        return self._advance(reobj.search(self.str, self.cur_pos))

    def match(self, reobj):
        """Match ``reobj`` starting exactly at the cursor.

        Returns the match object, or None when the pattern does not match
        at the current position.
        """
        return self._advance(reobj.match(self.str, self.cur_pos))
def parse_job_page(db_kversion, host, url):
    """Scrape one job results page and store every test row found on it.

    db_kversion -- kernel version record attached to each saved result
    host        -- name used to look up the ClientMachine record
    url         -- page path appended to http://test.kernel.org/functional/
    """
    machine = ClientMachine.objects.get(name=host)

    # Download the requested results page and wrap it for sequential parsing.
    page = StringMatcher(geturl("http://test.kernel.org/functional/" + url))

    # Consume result rows until the func_test pattern no longer matches.
    while True:
        row = page.search(func_test)
        if row is None:
            break
        type_name, test_id, status_name, reason = row.group(1, 2, 3, 4)
        status = Status.objects.get(name=status_name)
        test_type, _created = TestType.objects.get_or_create(name=type_name)
        record = FunctionalTest(
            id=test_id,
            type=test_type,
            kversion=db_kversion,
            client=machine,
            status=status,
            reason=reason,
        )
        record.save()
# Patterns for walking the scraped HTML.  They are raw strings so that regex
# escapes such as \s and \d are not interpreted as (invalid) string escapes.

# Start of a kernel row: group 1 is the base version text.
version = re.compile(r'<td>([a-zA-Z0-9.-]+)\s')
# One applied patch: group 1 is the link target after the leading "../".
patch = re.compile(r'<br>\+<a href="\.\.\/(.+)">.+</a>\s')
# End of the version cell.
end_version = re.compile(r'</td>\s')
# A result cell that carries no link.
# NOTE(review): the character between the tags may have been &nbsp; in the
# upstream page before this file was captured -- confirm against live HTML.
nonlink = re.compile(r'<td> </td>\s')
# A result cell with a link: group 1 is the href, group 2 the link text.
link = re.compile(r'<td.+\s.+"(.+)">(\w+)</a>\s</td>\s')
# One row of a job page; parse_job_page reads the groups as
# 1 = test type name, 2 = test id, 3 = status name, 4 = reason.
func_test = re.compile(r'<td>(\w+)</td>\s<td><.+>(\d+)</a></td>\s<td><.+>(\w+)</a></td>\s<td><.+>(.+)</a></td>\s</tr>')
# Download the results index and walk it one kernel version at a time.
main = StringMatcher(geturl("http://test.kernel.org/functional/index.html"))
# Test machines, in the order their result cells follow each kernel version.
hosts = ['elm3b6', 'moe', 'elm3b132', 'elm3b133', 'gekko-lp1', 'pSeries-101', 'bl6-13', 'elm3b239']

while True:
    # get the base version and create a record in the DB
    m = main.search(version)
    if m is None:
        print("no more kernels found, exiting")
        break
    db_kversion = KernelVersion(base=m.group(1))
    db_kversion.save()

    # find the patches and add them to the kernel version (also in the DB)
    while True:
        m = main.match(patch)
        if m is None:
            break
        db_kversion.add_patch_at_url(m.group(1))

    if main.search(end_version) is None:
        print("unexpected error: no end_version found")
        break
    print("Fetching results for %s" % db_kversion.printable)

    # now get the actual results, page by page: one cell per machine, in the
    # order given by 'hosts' (iterating the list keeps the count in sync)
    layout_ok = True
    for host in hosts:
        if main.match(nonlink) is not None:
            # empty cell: no results for this machine
            continue
        m = main.match(link)
        if m is None:
            # Neither an empty cell nor a link: the page layout is not what
            # we expect, so stop instead of failing on m.group() below.
            print("unexpected error: no result cell found for %s" % host)
            layout_ok = False
            break
        parse_job_page(db_kversion, host, m.group(1))
    if not layout_ok:
        break
|
Tobias McNulty Powered by ViewCVS 1.0-dev |
ViewCVS and CVS Help |