#!/usr/bin/python
# Minas Gjoka
import urllib,re, sys, time, random, datetime, time, signal
def handler(signum, frame_unused):
    """Terminate the script when SIGINT (Ctrl-C) is delivered.

    Args:
        signum: Number of the signal that was received.
        frame_unused: Stack frame passed in by the signal module; ignored.

    Raises:
        SystemExit: If ``signum`` is ``signal.SIGINT``.
    """
    # Any other signal routed to this handler is ignored.
    if signum == signal.SIGINT:
        sys.exit()
# Route SIGINT (Ctrl-C) to handler() so the script exits via sys.exit().
signal.signal(signal.SIGINT, handler)
# nextlinkRegExp is searched in every fetched page; its group(1) becomes the
# next URL to fetch. entryRegExp is iterated over every fetched page and its
# groups 1, 2, 4, 6, 7 and 9 are read for each match.
# NOTE(review): both patterns look truncated in this copy. The string literal
# that starts on the next line is not closed before the line break, and
# neither pattern contains a capture group although the code below reads
# group(1) through group(9). Presumably the HTML markup inside the patterns
# was lost; restore them from the original script before running.
nextlinkRegExp = re.compile("
Next")
entryRegExp = re.compile(".*\n\
")
# Alternative start URL (local test page), left disabled.
#nextlink = "http://127.0.0.1/test14.html"
# URL of the first page to fetch; replaced after each page by the link that
# nextlinkRegExp extracts.
nextlink = "http://www.facebook.com/apps/index.php?category=0"
# Start-up timestamp, converted to seconds and used to name the index file.
# NOTE(review): time.mktime() interprets its argument as local time, while
# t_init comes from utcnow(); the result is offset from the real epoch value
# unless the local timezone is UTC - confirm whether that matters here.
t_init = datetime.datetime.utcnow()
unixsecs_init= time.mktime(t_init.timetuple())
# Index file 'apps-<start-up seconds>' that receives one line per matched
# entry; opened in "w" mode, so an existing file of that name is truncated.
allapps = open('apps-%d' % unixsecs_init,"w")
# Main crawl loop: download the page at nextlink, record every entry that
# entryRegExp matches on it, then move on to the link found by
# nextlinkRegExp. The loop ends when no next link can be extracted.
# NOTE(review): indentation has been lost in this copy of the script, so the
# nesting described in the comments below is inferred - confirm against the
# original file.
while 1:
# Retry loop: repeat the download until one attempt succeeds (ok == 1).
while 1:
ok = 0
try :
# Random pause of 0 to 3 seconds (inclusive) before each request.
ran = random.randint(0,3)
print 'Sleeping for %d before fetching %s' % (ran, nextlink)
time.sleep(ran)
sock = urllib.urlopen( nextlink)
htmlSource = sock.read()
sock.close()
ok = 1
# Bare except: any failure above leads to another attempt, with no limit on
# the number of retries.
# NOTE(review): in Python 2 a bare except also catches the SystemExit that
# handler() raises on Ctrl-C, so an interrupt during the sleep or the fetch
# is presumably reported as "retrying" instead of ending the script - confirm.
except:
print "Error encountered: retrying"
ok = 0
if ok==1:
break
# Scan the downloaded page for entries.
iterator = entryRegExp.finditer(htmlSource)
# count: number of entries processed on the current page.
count = 0
for match in iterator:
# group(1) is appended to the host name; the digits after the last '/' of
# the resulting URL are taken as the application id.
url = 'www.facebook.com%s' % match.group(1)
# NOTE(review): re.search() returns None when the URL does not end in
# /<digits>, in which case .group(1) raises AttributeError and nothing here
# catches it.
appid = re.search('/(\d+)$',url).group(1)
name = match.group(2)
# group(4) is optional: '-' is used when it did not match, otherwise
# anything that looks like an HTML tag is stripped from it.
if match.group(4) == None:
author = '-'
else:
author = re.sub('<.*?>', '', match.group(4))
# group(6) and group(7) are optional: both values default to 0 when group(6)
# did not match. Commas are removed from group(6); the integer in front of
# '%' is taken from group(7). The names suggest daily active users and a
# percentage of the total - presumably; verify against the page markup.
if match.group(6)== None:
dau = 0
perc_tot = 0
else:
dau = re.sub(',','', match.group(6))
perc_tot = re.search('(\d+)%',match.group(7)).group(1)
# group(9) is optional and defaults to 0 when it did not match.
if match.group(9) == None:
reviews = 0
else:
reviews = match.group(9)
#print appid, url, name, author, dau, perc_tot,reviews
# Timestamp of this sample, converted the same way as at start-up.
t = datetime.datetime.utcnow()
unixsecs= time.mktime(t.timetuple())
# Append one line "<seconds> <dau> <perc_tot> <reviews>" to a file named
# after the application id, relative to the current working directory.
fout=open('%s' % appid,"a")
fout.write('%d %d %d %d\n' % (int(unixsecs),int(dau),int(perc_tot),int(reviews)) )
fout.close()
# Add the entry to the run-wide index file, fields separated by '|'.
allapps.write('%s|%s|%s|%s\n' % (appid,url,name,author) )
count = count + 1
print count
#sys.exit()
# Follow the next-page link. When the search finds nothing, .group(1) raises
# and the bare except below ends the crawl.
try:
nextlink = nextlinkRegExp.search(htmlSource).group(1)
except :
print "Exiting"
break
allapps.close()
# NOTE(review): the script exits with status 1 even after a normal finish -
# confirm that a non-zero status is intended.
sys.exit(1)