#!/usr/bin/python
# Minas Gjoka
"""Derive plot-ready statistics from two crawl dumps of application installs.

Usage:
    <script> apps_uid users_appid

Both inputs are text files with one whitespace-separated record per line; the
first token is the record key and the remaining tokens are its members:

    apps_uid     <uid>   <app> <app> ...
    users_appid  <appid> <uid> <uid> ...

Four files are written to the current working directory:

    coverage.dat                    distinct users covered by the top-k apps
    ccdf-installs-datacrawled.dat   CCDF of the per-application user count
    rank-installs-datacrawled.dat   per-application user count by rank
    apps-peruser.dat                histogram of applications per user
"""

# The import statements are kept exactly as in the original script.  Several of
# them (urllib2, cookielib, httplib) exist only on Python 2, so the script as a
# whole remains a Python 2 program even though the code below uses syntax that
# is also valid on Python 3.
import urllib,re, sys, time, random, datetime, time, signal, urllib2, cookielib, httplib, gzip, glob, os, csv
import Gnuplot, Gnuplot.funcutils
from scipy import *


def calc_cdf(sorted_list):
    """Return the empirical CDF of the values in `sorted_list`.

    The distinct values are sorted inside this function, so the input does not
    actually have to be pre-sorted.

    Returns a two-element list ``[elements, set_cdf]`` where `elements` holds
    the distinct values in ascending order and ``set_cdf[i]`` is the fraction
    of entries of `sorted_list` that are <= ``elements[i]``.  An empty input
    yields ``[[], []]``.
    """
    total = len(sorted_list)

    list_freq = {}
    for value in sorted_list:
        list_freq[value] = list_freq.get(value, 0) + 1

    elements = sorted(list_freq)

    set_cdf = []
    n_entries = 0
    for value in elements:
        n_entries += list_freq[value]
        # float() forces true division on Python 2.
        set_cdf.append(float(n_entries) / total)
    return [elements, set_cdf]


def _load_mapping(path):
    """Read `path` into a dict: first token of a line -> list of the others.

    Blank lines are skipped.  If a key occurs on several lines, the last
    occurrence wins.
    """
    mapping = {}
    with open(path, "r") as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                continue
            mapping[fields[0]] = fields[1:]
    return mapping


def _cumulative_coverage(users_appid, ranked_appids):
    """Return, per rank, how many distinct users the apps up to it reach."""
    covered = set()
    counts = []
    for appid in ranked_appids:
        covered.update(users_appid.get(appid, []))
        counts.append(len(covered))
    return counts


def main():
    """Parse the command line, load both dumps and write the four data files."""
    try:
        name_apps_uid = sys.argv[1]
        name_users_appid = sys.argv[2]
    except IndexError:
        print("Usage: %s apps_uid users_appid" % sys.argv[0])
        sys.exit(1)

    apps_uid = _load_mapping(name_apps_uid)
    print("Apps_uid loaded")

    users_appid = _load_mapping(name_users_appid)
    print("Users_appid loaded")

    # Number of applications per user -> how many users have that many.
    nofapps_frequency = {}
    for apps in apps_uid.values():
        n_apps = len(apps)
        nofapps_frequency[n_apps] = nofapps_frequency.get(n_apps, 0) + 1

    # appid -> number of users listed for that application.
    nofapps_appid = {}
    for appid, uids in users_appid.items():
        nofapps_appid[appid] = len(uids)

    print("Calculate coverage")

    # Applications in decreasing order of popularity.  sorted() is stable even
    # with reverse=True, so equally popular applications keep the relative
    # order they have in the dict, as with the original cmp-based sort.
    list_appids = sorted(nofapps_appid, key=nofapps_appid.get, reverse=True)
    y_data = _cumulative_coverage(users_appid, list_appids)

    print("Generating graphs")

    ############# Coverage #####################
    # Kept as floats so the ratios below use true division on Python 2.
    n_allapps = float(len(users_appid))
    n_allusers = float(len(apps_uid))
    print("%s %s" % (n_allapps, n_allusers))

    with open("coverage.dat", "w") as fcoverage:
        fcoverage.write("#RankedApps Coveredusers Perc_RankedApps(%d) Perc_Coveredusers(%d)\n" % (n_allapps, n_allusers))
        # Each row also carries the appid as a fifth column, which the header
        # line does not name.
        for rank, (appid, n_covered) in enumerate(zip(list_appids, y_data), 1):
            fcoverage.write("%d %d %f %f %s\n" % (rank, n_covered, rank / n_allapps, n_covered / n_allusers, appid))

    ################ CCDF of Total Installs #######################
    installs_popularity = sorted(nofapps_appid.values())
    elements, cdf_installs_popular = calc_cdf(installs_popularity)

    with open('ccdf-installs-datacrawled.dat', "w") as fout:
        for value, cdf_value in zip(elements, cdf_installs_popular):
            fout.write("%d %f\n" % (value, 1 - cdf_value))

    with open("rank-installs-datacrawled.dat", "w") as fout:
        fout.write("#RankedApp TotalInstalls\n")
        # Rank 1 is the application with the most users.
        for rank, installs in enumerate(reversed(installs_popularity), 1):
            fout.write('%d %d\n' % (rank, installs))

    ################## Histogram for number of apps ####################
    with open("apps-peruser.dat", "w") as fout:
        fout.write("#Numofapps Instances\n")
        for n_apps in sorted(nofapps_frequency):
            fout.write("%d %d\n" % (n_apps, nofapps_frequency[n_apps]))


if __name__ == "__main__":
    main()