#!/usr/bin/python # Minas Gjoka import urllib,re, sys, time, random, datetime, time, signal, urllib2, cookielib, httplib, gzip, glob, os, csv import Gnuplot, Gnuplot.funcutils from scipy import * from scipy import fftpack from pylab import * def calc_cdf(sorted_list): list_freq= {} for index in range(0,len(sorted_list)): list_freq[ sorted_list[index] ] = list_freq.get( sorted_list[index] , 0) + 1 elements = list_freq.keys() elements.sort( lambda x,y :cmp( x,y) ) cdf = {} ccdf = {} n_entries = 0 for index in range(0,len(elements)): n_entries = list_freq[ elements[index] ] + n_entries cdf[ elements[index] ] = float( n_entries)/len(sorted_list) set_cdf = [cdf[key] for key in elements] return [elements, set_cdf] def divide_safe(nominator,denominator): if denominator!=0: return nominator/float(denominator) else: return 0 fset = glob.glob('app-*') print "Number of files:" + str(len(fset)) popularity_day = "2008-02-14" counter = 0 numofapps_time = {} dau_time = {} total_time = {} dau_popularityday = [] perc_popularityday = [] total_popularityday = [] avg_activity = {} dau_appidlist_time = {} dau_allappspopularity = {} totalinstalls_allappspopularity = {} for name in fset: namefields = os.path.basename(name).split("-") appid = namefields[1] if counter % 500 == 0: print "Iteration %d for application %s" % (counter, appid) reader = csv.reader(open(name, "rb")) #skip first two lines reader.next() reader.next() #row[0] = Date #row[1] = Totals #row[2] = DAU for row in reader: numofapps_time[ row[0] ] = numofapps_time.get( row[0], 0) + 1 total_time[ row[0] ] = total_time.get( row[0], 0 ) + int(row[1]) dau_time[ row[0] ] = dau_time.get( row[0], 0 ) + int(row[2]) if float(row[1])!=0: avg_activity[ row[0] ] = avg_activity.get( row[0], 0 ) + int(row[2])*int(row[2])/float(row[1]) list = dau_appidlist_time.get( row[0], []) list.append( [ appid, int(row[2]) ] ) dau_appidlist_time[ row[0] ] = list list = dau_allappspopularity.get( row[0], []) list.append( int(row[2]) ) dau_allappspopularity[ row[0] ] = list list = totalinstalls_allappspopularity.get( row[0], []) list.append( int(row[1]) ) totalinstalls_allappspopularity[ row[0] ] = list if row[0] == popularity_day: dau_popularityday.append( int(row[2]) ) total_popularityday.append( int(row[1]) ) perc_popularityday.append( divide_safe( int(row[2]), float(row[1]) ) ) counter = counter + 1 # weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] day_name = {} pattern = '%Y-%m-%d' day_map_epoch = {} day_map_counter = {} listofdays = numofapps_time.keys() for date in listofdays: epoch = int(time.mktime(time.strptime(date, pattern))) day_map_epoch[ date ] = epoch day_name [ date ] = weekday[time.localtime(epoch)[6]] listofdays.sort( lambda x,y :cmp(day_map_epoch[x] ,day_map_epoch[y]) ) x_data = [] y_data_napps = [] y_data_dau = [] y_data_total = [] y_data_dauovertotal = [] y_activity = [] count = 0 mostpopular_daily = {} excluded_mostpopular = [2392950137] fday = open("daymapping","w") for day in listofdays: count = count + 1 day_map_counter[ day ] = count if count > 65 and count <= 235: fday.write("%d %s %s\n" %(count,day, day_name[day] ) ) x_data.append(count) y_data_napps.append( numofapps_time[day] ) y_data_dau.append( dau_time[day] ) y_data_total.append( total_time[day] ) #print day, numofapps_time[day], dau_time[day], total_time[day] if total_time[day]>0: dauovertotal = dau_time[day]/float(total_time[day] ) else: dauovertotal = 0 y_data_dauovertotal.append( dauovertotal ) avg_activity[ day ] = avg_activity.get( day,0 ) / ( dau_time[day] ) y_activity.append( avg_activity[ day ] ) #print day, avg_activity[ day ] list = dau_appidlist_time[day ] list.sort( lambda x,y :cmp(x[1] ,y[1]), reverse=True ) print 'Day %s' % (day) rank = 1 for i in range(5): #print list[i][0], list[i][1] while (int(list[rank][0]) in excluded_mostpopular): #print 'Excluded %d' % int(list[rank][0]) rank = rank + 1 mostpopular_daily[ list[rank][0] ] = 1 rank = rank + 1 #print '' fday.close() ############################ mostpopular_apps = mostpopular_daily.keys() print len( mostpopular_apps ) gnuplot_dat = {} g = Gnuplot.Gnuplot(debug=0) s = 'plot ' for appid in mostpopular_apps: name = 'app-%s' % appid if counter % 500 == 0: print "Popular application %s" % (appid) reader = csv.reader(open(name, "rb")) #skip first two lines reader.next() reader.next() #row[0] = Date #row[1] = Totals #row[2] = DAU x = [] y = [] f_out = open("popular-%s" % appid, "w") for row in reader: if day_map_counter[ row[0] ] > 65: # after August 28 2007 f_out.write('%d %d\n' % ( day_map_counter[ row[0] ] , int(row[2]) ) ) #print day_map_counter[ row[0] ] , row[2] f_out.close() s = '%s "popular-%s" using 1:2 with linespoints, ' % (s,appid) s = s[0:len(s)-2] g('set grid') g('set logscale y') g('set pointsize 0.5') g('set key off') g.xlabel('Timeline') g.ylabel('DAU') g('set terminal postscript color enhanced "Arial" 12') g('set output "mostpopular-apps.eps"') g('%s' % s) ############################ fout = open("nofapps.dat","w") fout.write("#Time NumberofApps\n") for i in range(len(x_data)): fout.write("%d %d\n" % (x_data[i], y_data_napps[i]) ) fout.close() #--------------- fout = open("dau.dat","w") fout.write("#Time Accumulated_DAU\n") for i in range(len(x_data)): fout.write("%d %d\n" % (x_data[i], y_data_dau[i]) ) fout.close() #--------------- fout = open("totalinstalls.dat","w") fout.write("#Time Accumulated_TotalInstalls\n") for i in range(len(x_data)): fout.write("%d %d\n" % (x_data[i], y_data_total[i]) ) fout.close() #----------------- fout = open("dauovertotalinstalls.dat","w") fout.write("#Time Dauovertotalinstalls\n") for i in range(len(x_data)): fout.write("%d %f\n" % (x_data[i], y_data_dauovertotal[i]) ) fout.close() ##########Periodicity################ y_data_input = y_activity input_array = array(y_data_input) Y=fft(input_array) n=len(Y) power = abs(Y[1:(n/2)])**2 nyquist=1./2 freq=array(range(n/2))/(n/2.0)*nyquist period=1./freq fout = open("periodicity.dat","w") fout.write("#Period Power\n") for i in range(1,10): fout.write("%d %f\n" % (period[i], power[i]) ) fout.close() ############Average Activity############## fout = open("average-activity.dat","w") fout.write("#Day Value\n") for i in range(len(x_data)): fout.write("%d %f\n" % (x_data[i], y_activity[i])) fout.close() ###########Ranking and CCDF of DAU################ dau_popularityday.sort(reverse=False) [elements, cdf_dau_popular] = calc_cdf(dau_popularityday) fout = open("ccdf-dau-lastday.dat","w") fout.write("#DAU CCDF\n") for i in range(len(elements)): fout.write('%d %f\n' % (elements[i], 1-cdf_dau_popular[i] ) ) fout.close() #---------------- ranking = arange(1, len(dau_popularityday)+1) dau_popularityday.sort(reverse=True) fout = open("rank-dau-lastday.dat","w") fout.write("#RankedApp DAU\n") for i in range(len(ranking)): fout.write('%d %d\n' % (ranking[i], dau_popularityday[i] ) ) fout.close() ###########Ranking and CCDF of Total installs################ total_popularityday.sort(reverse=False) [elements, cdf_total_popular] = calc_cdf(total_popularityday) fout = open("ccdf-totalinstalls-lastday.dat","w") fout.write("#TotalInstalls CCDF\n") for i in range(len(elements)): fout.write('%d %f\n' % (elements[i], 1-cdf_total_popular[i] ) ) fout.close() #------------ ranking = arange(1, len(total_popularityday)+1) total_popularityday.sort(reverse=True) fout = open("rank-totalinstalls-lastday.dat","w") fout.write("#RankedApp TotalInstalls\n") for i in range(len(ranking)): fout.write('%d %d\n' % (ranking[i], total_popularityday[i] ) ) fout.close() ##########Ranking and CCDF of Percentage of active users################# ranking = arange(1, len(perc_popularityday)+1) perc_popularityday.sort(reverse=True) fout = open("rank-perc-lastday.dat","w") fout.write("#Ranking Perc_ActiveOverTotalInstalls\n") for i in range(len(ranking)): fout.write('%d %f\n' % (ranking[i], perc_popularityday[i] ) ) fout.close() #############Daily active users - slope over time################ count = 0 cutmin = 512 fout = open('dau-slope-powerlaw.dat',"w") for day in listofdays: count = count + 1 day_map_counter[ day ] = count if count > 65 and count < 1000: list = dau_allappspopularity[day] list = array(list) list = list [ list >= cutmin] [elements,set_cdf] = calc_cdf(list) if len(list)==0: continue elements = array(elements) set_cdf = array(set_cdf) ccdf = 1 - set_cdf elements = elements [ (ccdf!=0)] set_cdf = set_cdf [ (ccdf!=0) ] ccdf = ccdf[ (ccdf!=0)] ccdf = ccdf[ elements !=0] set_cdf = set_cdf [ elements!=0 ] elements = elements [ elements!=0] pfit_ccdf = polyfit(log10(elements),log10(ccdf),1) xfit_ccdf = logspace(log10(min(elements)),log10(max(elements)),100); yfit_ccdf = 10**polyval(pfit_ccdf,log10(xfit_ccdf)); #print day, pfit_ccdf fout.write('%d %f\n' % (count, pfit_ccdf[0])) #g = Gnuplot.Gnuplot(debug=0) #g('set grid') #g('set pointsize 2') #g('set logscale x') #g('set logscale y') #g.xlabel('Active users') #g.ylabel('CDF') #ccdfdata = Gnuplot.Data(elements, ccdf, title='ccdf', with=' points') #ccdf_fit = Gnuplot.Data(xfit_ccdf, yfit_ccdf, title='fit', with=' points') #g.plot(ccdfdata, ccdf_fit) #raw_input('\nwait') fout.close() ##############Total installs - slope over time###################3 count = 0 cutmin = 512 fout = open('totalinstalls-slope-powerlaw.dat',"w") for day in listofdays: count = count + 1 day_map_counter[ day ] = count if count > 65 and count <= 235: list = totalinstalls_allappspopularity[day] list = array(list) list = list [ list >= cutmin] [elements,set_cdf] = calc_cdf(list) if len(list)==0: continue elements = array(elements) set_cdf = array(set_cdf) ccdf = 1 - set_cdf elements = elements [ (ccdf!=0)] set_cdf = set_cdf [ (ccdf!=0) ] ccdf = ccdf[ (ccdf!=0)] ccdf = ccdf[ elements !=0] set_cdf = set_cdf [ elements!=0 ] elements = elements [ elements!=0] pfit_ccdf = polyfit(log10(elements),log10(ccdf),1) xfit_ccdf = logspace(log10(min(elements)),log10(max(elements)),100); yfit_ccdf = 10**polyval(pfit_ccdf,log10(xfit_ccdf)); fout.write('%d %f\n' % (count, pfit_ccdf[0])) fout.close()