#!/usr/bin/python
# Minas Gjoka
"""Collect Facebook user ids and profile pages from a randomly populated page.

Usage: <script> outfilename_wuids email password

The script logs in through the "loginform" form on https://www.facebook.com,
then loops forever:

1. fetch ``my_networks`` (a page the original author describes as a random
   selection of up to 10 people each time),
2. append every uid matched by ``openprofileRegExp`` to the output file
   (repeats included, so the file may contain duplicates),
3. for each uid not seen before, download its profile page and store it
   gzip-compressed as ``profile-<uid>.gz`` in the current directory.

Uids already present in the output file are loaded at start-up so that their
profiles are not downloaded again.  Stop the script with Ctrl-C.
"""

import cookielib
import datetime
import gzip
import httplib
import random
import re
import signal
import sys
import time
import urllib
import urllib2

from mechanize import Browser

try:
    filename = sys.argv[1]
    email = sys.argv[2]
    password = sys.argv[3]
except IndexError:
    # Only a missing positional argument should trigger the usage message.
    print('Usage: %s outfilename_wuids email password' % sys.argv[0])
    sys.exit(1)

# Upper bound (seconds) for the randomized wait between retries, so that the
# doubling back-off used for profile downloads cannot grow without limit.
# Tunable: the value is a reviewer choice, not taken from the original script.
MAX_RETRY_WAIT_SECS = 300.0


def handler(signum, frame_unused):
    """Signal handler: terminate the script when SIGINT is received."""
    if signum == signal.SIGINT:
        sys.exit()


# Install the handler; it was defined but never registered, which left it
# dead code.  sys.exit() raises SystemExit, which the retry loops below no
# longer swallow, so Ctrl-C now stops the crawl.
signal.signal(signal.SIGINT, handler)


def sleeprandom(minsecs, maxsecs):
    """Sleep for a random number of seconds between minsecs and maxsecs.

    The previous formula (minsecs + random()*maxsecs) could sleep for up to
    minsecs + maxsecs; results are unchanged for callers passing minsecs=0.
    """
    num = random.uniform(minsecs, maxsecs)
    print('Sleeping for %f' % (num))
    time.sleep(num)


def fetch_with_retry(browser, url, max_waittime, backoff=1):
    """Open url with browser and return the response body, retrying on error.

    Before every attempt the function sleeps a random time in
    [0, max_waittime].  After each failed attempt max_waittime is multiplied
    by backoff (capped at MAX_RETRY_WAIT_SECS).  The function only returns
    once a request has succeeded.
    """
    while True:
        sleeprandom(0, max_waittime)
        try:
            return browser.open(url).read()
        except Exception:
            # "except Exception" instead of a bare "except" so that
            # KeyboardInterrupt / SystemExit still terminate the script.
            print("Error encountered: retrying")
            max_waittime = min(max_waittime * backoff, MAX_RETRY_WAIT_SECS)


USER_AGENT = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.1) Gecko/20071106 Iceweasel/2.0.0.5 (Debian-2.0.0.1+dfsg-1)"

br = Browser()
br.addheaders = [("User-agent", USER_AGENT)]
#br.addheaders = [("Accept-Encoding", "gzip, identity")]
#br.set_handle_gzip(True)

# remove comment if you get debug output
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
#br.set_debug_http(True)

# Log in; the session is kept by the mechanize browser for later requests.
br.open("https://www.facebook.com")
br.select_form("loginform")
br['email'] = email
br['pass'] = password
br['persistent'] = ["1"]
response = br.submit()
#print response.read()

# NOTE(review): the three patterns below are reproduced exactly as they
# appear in this copy of the file, but they look truncated (possibly markup
# that was lost).  As written, openprofileRegExp has no capture group, so
# match.group(1) in the main loop raises IndexError.  Restore the original
# patterns before running.
appfinderRegExp = re.compile("\n")
openprofileRegExp = re.compile("")
closeprofileRegExp = re.compile("(.*?)")

my_networks = "http://www.facebook.com/b.php/"

# Uids whose profile has already been handled (loaded from earlier runs).
hash_uids = set()
try:
    f = open(filename, "r")
except IOError:
    # First run: the output file does not exist yet (it is created below).
    print("File %s could not be read; starting with no known uids" % filename)
else:
    try:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines instead of crashing in int()
                hash_uids.add(int(line))
    finally:
        f.close()

init_size = len(hash_uids)
print("%d uids found in file %s" % (init_size, filename))

f = open(filename, "a")
counter = 0
try:
    while True:
        br.clear_history()

        # random selection of up to 10 people each time
        htmlSource = fetch_with_retry(br, my_networks, 3)

        counter = counter + 1
        n_closedprofiles = len(closeprofileRegExp.findall(htmlSource))
        print("Random retrieval successful: %d/%d unique UIDs in iteration %d \t\t %d closed profiles" %
              (len(hash_uids), counter*10, counter, n_closedprofiles))

        for match in openprofileRegExp.finditer(htmlSource):
            uid = int(match.group(1))
            # Every sampled uid is logged, including ones seen before.
            f.write('%d\n' % uid)
            f.flush()

            if uid in hash_uids:
                continue

            url_profile = "http://www.facebook.com/profile.php?id=%d" % uid
            # Start with a short wait and double it after each failure.
            profile_html = fetch_with_retry(br, url_profile, 0.25, backoff=2)
            print("Profile %d retrieved" % uid)

            f_profile = gzip.open("profile-%d.gz" % uid, "w")
            try:
                f_profile.write("%s\n" % profile_html)
            finally:
                f_profile.close()
            hash_uids.add(uid)

            #iterator = appfinderRegExp.finditer(htmlSource)
            #for match in iterator:
                #print match.group(1)
finally:
    # Reached on Ctrl-C or on an unexpected error: do not leak the handle.
    f.close()