#!/usr/bin/python
# Minas Gjoka
"""Crawl the Facebook application directory and record per-application stats.

Starting from ``nextlink`` the script downloads one directory page at a time,
extracts every application entry with ``entryRegExp`` and then follows the
link found by ``nextlinkRegExp`` until no further link can be read.

Files written to the current working directory:

* ``apps-<seconds at start>``: one ``appid|url|name|author`` line per entry.
* ``<appid>``: one file per application, appended to on every sighting, with
  lines ``<seconds> <daily active users> <percent> <reviews>``.

The code targets Python 2 (it relies on ``urllib.urlopen``).
"""
import datetime
import random
import re
import signal
import sys
import time
import urllib


def handler(signum, frame_unused):
    """Terminate the process when SIGINT (Ctrl-C) is delivered.

    Any other signal number is ignored and the function returns None.
    """
    if signum == signal.SIGINT:
        sys.exit()


signal.signal(signal.SIGINT, handler)

# NOTE(review): the pagination code reads group(1) of this pattern, but the
# pattern text available here contains no capture group.  The markup it was
# written against appears to be missing from this copy -- restore the original
# pattern before relying on pagination.
nextlinkRegExp = re.compile(r"Next")

# NOTE(review): the parsing code reads groups 1, 2, 4, 6, 7 and 9, while the
# pattern text available here only defines 8 groups, so the surrounding markup
# (and at least one group) appears to be missing from this copy.  The group
# indices used below are left untouched; restore the original pattern.
entryRegExp = re.compile(
    r"(.*)\n"
    r"(By (.*)\n)?"
    r".*\n"
    r"((.*)daily active user[s]? \((.*)\))?"
    r"(.*(.*) review[s]?)?"
)

#nextlink = "http://127.0.0.1/test14.html"
nextlink = "http://www.facebook.com/apps/index.php?category=0"


def _unix_seconds_now():
    """Return ``time.mktime`` applied to the current UTC time tuple.

    NOTE: ``mktime`` interprets its argument as local time, so the value is
    shifted from true epoch seconds by the local UTC offset.  This is kept
    as-is so new records stay comparable with files written earlier.
    """
    return time.mktime(datetime.datetime.utcnow().timetuple())


def _fetch(url):
    """Download ``url`` and return its body, retrying until a request succeeds.

    Sleeps a random 0-3 seconds before every attempt.
    """
    while True:
        ran = random.randint(0, 3)
        print('Sleeping for %d before fetching %s' % (ran, url))
        time.sleep(ran)
        try:
            sock = urllib.urlopen(url)
            try:
                return sock.read()
            finally:
                sock.close()
        except Exception:
            # Deliberately not a bare ``except``: SystemExit (raised by the
            # SIGINT handler) and KeyboardInterrupt must propagate, otherwise
            # Ctrl-C would only trigger another retry.
            print("Error encountered: retrying")


def _record_entry(match, allapps):
    """Write one matched directory entry to its stats file and to ``allapps``.

    Returns True when the entry was recorded, False when it was skipped
    because its URL does not end in ``/<digits>``.
    """
    url = 'www.facebook.com%s' % match.group(1)
    id_match = re.search(r'/(\d+)$', url)
    if id_match is None:
        print('Skipping entry without numeric id: %s' % url)
        return False
    appid = id_match.group(1)
    name = match.group(2)

    if match.group(4) is None:
        author = '-'
    else:
        # Drop any tags embedded in the author text.
        author = re.sub('<.*?>', '', match.group(4))

    if match.group(6) is None:
        dau = 0
        perc_tot = 0
    else:
        dau = match.group(6).replace(',', '')  # strip thousands separators
        perc_match = re.search(r'(\d+)%', match.group(7))
        perc_tot = perc_match.group(1) if perc_match else 0

    if match.group(9) is None:
        reviews = 0
    else:
        reviews = match.group(9)

    unixsecs = _unix_seconds_now()
    with open(appid, "a") as fout:
        fout.write('%d %d %d %d\n'
                   % (int(unixsecs), int(dau), int(perc_tot), int(reviews)))
    allapps.write('%s|%s|%s|%s\n' % (appid, url, name, author))
    return True


def main():
    """Walk the directory pages from ``nextlink`` until no next link is found."""
    link = nextlink
    allapps = open('apps-%d' % _unix_seconds_now(), "w")
    try:
        while True:
            html_source = _fetch(link)

            count = 0
            for match in entryRegExp.finditer(html_source):
                if _record_entry(match, allapps):
                    count += 1
            # Entries recorded on this page (assumed to be printed once per
            # page; the original layout of this statement is ambiguous).
            print(count)

            found = nextlinkRegExp.search(html_source)
            try:
                link = found.group(1) if found is not None else None
            except IndexError:
                # The pattern defines no group 1.
                link = None
            if not link:
                print("Exiting")
                break
    finally:
        # Close the index file even when parsing or writing fails.
        allapps.close()
    # Exit status 1 is what the script has always returned on completion.
    sys.exit(1)


if __name__ == "__main__":
    main()