-
Notifications
You must be signed in to change notification settings - Fork 1
/
app-ads-crawler.py
153 lines (141 loc) · 5.99 KB
/
app-ads-crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/local/bin/python3.7
"""
Mobile Spidey
Crawl app-ads.txt
import all the requests modules
"""
import requests
import pandas as pd
import sys
import csv
import re
NAME_INDEX, REL_TYPE_INDEX = 0, 2
"""
Given a list of app names, bundle IDs and operating systems, retrieve and interpret the app-ads.txt file for each app.
The goal is to identify whether TripleLift is listed, and if so whether it is listed properly, in the mobile equivalent
of ads.txt for app publishers.
app-ads.txt info and givens based on https://iabtechlab.com/wp-content/uploads/2019/03/app-ads.txt-v1.0-final-.pdf
"""
# Upper bound on how long a single store request may block, so one
# unresponsive server cannot stall the whole crawl.
_STORE_TIMEOUT_SECONDS = 10

# The Google Play page is scanned for the text 'appstore:developer_url' up to
# the last '>' on that line; the first double-quoted http(s) URL inside that
# span is taken as the developer URL.
_DEVELOPER_URL_TAG = re.compile(r'appstore:developer_url.*>')
_QUOTED_HTTP_URL = re.compile(r'"(http[^"]+)"')


def _lookup_ios_url(bundle):
    """Return the ``sellerUrl`` reported by the iTunes lookup endpoint, or ``None``."""
    store_url = 'http://itunes.apple.com/lookup?id=%s' % bundle
    try:
        response = requests.get(store_url, timeout=_STORE_TIMEOUT_SECONDS)
        # An unknown id can yield an empty 'results' list, and an app may have
        # no 'sellerUrl'; neither is an error, there is simply no URL.
        results = response.json().get('results') or []
        seller_url = results[0].get('sellerUrl') if results else None
    except Exception as err:
        # Deliberately broad: the lookup is best-effort, and one bad app must
        # not abort a crawl over many apps.
        print(f'Other error occurred: {err}')
        return None
    if not seller_url:
        return None
    print(f'Huzzah! The iOS app url is {seller_url}!\n')
    return seller_url


def _lookup_android_url(bundle):
    """Return the developer URL scraped from the Google Play details page, or ``None``."""
    store_url = 'https://play.google.com/store/apps/details?id=%s' % bundle
    try:
        response = requests.get(store_url, timeout=_STORE_TIMEOUT_SECONDS)
        page = response.text
    except Exception as err:
        # Deliberately broad for the same best-effort reason as the iOS lookup.
        print(f'Other error occurred: {err}')
        return None
    tag = _DEVELOPER_URL_TAG.search(page)
    if tag is None:
        return None
    quoted = _QUOTED_HTTP_URL.search(tag.group(0))
    if quoted is None:
        # The tag is present but carries no quoted http(s) URL.
        return None
    url = quoted.group(1)
    print(f'Huzzah! The Android app url is {url}!')
    return url


def get_url(bundle, os):
    """Look up the developer/seller website of an app from its app store.

    Args:
        bundle: Store identifier of the app. It is interpolated into the
            ``id`` query parameter of the iTunes lookup URL (iOS) or of the
            Google Play details URL (Android).
        os: Platform label, ``'ios'`` or ``'android'``. Case and surrounding
            whitespace are ignored.

    Returns:
        The developer URL as a string, or ``None`` when the platform label is
        not recognised, the store request fails, or the store response does
        not contain a developer URL.
    """
    platform = str(os).strip().lower()
    if platform == 'ios':
        return _lookup_ios_url(bundle)
    if platform == 'android':
        return _lookup_android_url(bundle)
    print('Invalid OS - send \"ios\" or \"android\"\n')
    return None
"""
ANDROID LOGIC - GRAB SELLERURL FROM PLAYSTORE
elif os == 'android':
#android logic
store_url = 'https://play.google.com/store/apps/details?id=%s' % bundle
try:
response = requests.get(store_url)
url = response.json()['results'][0]['sellerUrl']
# df = pandas.read_json('1.txt') # parse txt file json returned from apple store using pandas
# url = df['results'][0]['sellerUrl']
except HTTPError as http_err:
print(f'HTTP error occurred: {http_err}') # Python 3.6
except Exception as err:
print(f'Other error occurred: {err}')
else:
print('Huzzah! The app url is %s!' % url)
return url
"""
# Upper bound on how long the app-ads.txt request may block.
_ADS_TXT_TIMEOUT_SECONDS = 10


def _find_triplelift_entry(ads_text):
    """Return ``(pub_id, rel_type)`` of the first TripleLift record, or ``None``."""
    for line in ads_text.splitlines():
        # Drop '#' comments so a mention of TripleLift in a comment is not
        # mistaken for a record.
        record = line.split('#', 1)[0]
        fields = [field.strip() for field in record.split(',')]
        # A usable record needs at least domain, publisher id and relationship.
        if len(fields) >= 3 and 'triplelift' in fields[0].lower():
            return fields[1], fields[2]
    return None


def get_app_ads_text(app_url):
    """Fetch ``<app_url>/app-ads.txt`` and report how TripleLift is listed.

    Args:
        app_url: Developer website URL; ``app-ads.txt`` is appended to it.

    Returns:
        A ``(listed_pub_id, tl_rel_type)`` tuple:

        * ``(pub_id, rel_type)`` taken from the second and third fields of
          the first record whose domain field contains ``triplelift``;
        * ``(None, 'Unlisted')`` when the file was retrieved but has no such
          record;
        * ``(None, 'no_app_ads_txt')`` when ``app_url`` is empty, the request
          fails, or the response is not a 200 non-HTML body mentioning
          DIRECT or RESELLER.
    """
    listed_pub_id, tl_rel_type = None, 'no_app_ads_txt'
    if not app_url:
        # Nothing to fetch; indexing an empty URL used to raise here.
        return listed_pub_id, tl_rel_type
    ads_url = app_url.rstrip('/') + '/app-ads.txt'
    try:
        response = requests.get(ads_url, timeout=_ADS_TXT_TIMEOUT_SECONDS)
        body = response.text
        # Sites often answer with an HTML page instead of the text file, so
        # require a 200, no '<head>' tag and at least one relationship keyword.
        looks_like_ads_txt = (
            response.status_code == 200
            and '<head>' not in body
            and any(term in body.upper() for term in ('DIRECT', 'RESELLER'))
        )
        if not looks_like_ads_txt:
            print('response text looks different..')
            print(body)
            return listed_pub_id, tl_rel_type
        entry = _find_triplelift_entry(body)
        if entry is not None:
            return entry
        print('actually not listed in app-ads.txt')
        tl_rel_type = 'Unlisted'
    except Exception as err:
        # Deliberately broad: the fetch is best-effort and one unreachable
        # site must not abort the crawl.
        print(f'Error in adstxt getter: {err}')
    return listed_pub_id, tl_rel_type
"""
Return the relationship type/status of Triplelift in the app publisher's app-ads.txt file.
triplelift.com, 3410, DIRECT, 6c33edb13117fd86 => 'DIRECT'
xad.com, 767, RESELLER, 81cbf0a75a5e0e9a => 'Unlisted'
triplelift.com, 1190, RESELLER, 81cbf0a75a5e0e9 => 'RESELLER'
"""
def main(argv=None):
    """Crawl app-ads.txt for every app in the input CSV and write the results.

    The input file defaults to ``mobileapplist.csv``; it is replaced by the
    first command-line argument when that argument ends in ``.csv``. The
    first four columns of each input row are read, in order, as app name,
    original publisher id, bundle id and OS. One row per app is written to
    ``mobile_crawler_results.csv`` in the current directory.

    Args:
        argv: Argument list in ``sys.argv`` layout (program name first).
            Defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    print('Starting mobile ads.txt crawler')

    filename = 'mobileapplist.csv'
    # The former check sliced [-4:-1] ('.cs' for 'x.csv'), so it never matched
    # and the command-line argument was always ignored.
    if len(argv) > 1 and argv[1].lower().endswith('.csv'):
        filename = argv[1]
    df = pd.read_csv(filename)

    # One header per value written below; 'orig_pub_id' was previously
    # missing, which shifted every later column by one.
    headers = ['name', 'orig_pub_id', 'bundle_id', 'os', 'app_url',
               'listed_pub_id', 'tl_rel_type']
    crawled = 0
    # newline='' is what the csv module expects for files it writes; the
    # with-block guarantees the results are flushed and closed.
    with open('mobile_crawler_results.csv', 'w', newline='') as results_file:
        cwriter = csv.writer(results_file)
        cwriter.writerow(headers)
        for _, row in df.iterrows():
            # Positional access, independent of the input's column labels.
            name, orig_pub_id, bundle, app_os = row.iloc[:4]
            print(name, orig_pub_id, bundle, app_os)

            # 'no_url' is reported when the store lookup yields no URL.
            listed_pub_id, tl_rel_type = None, 'no_url'
            app_url = get_url(bundle, app_os)
            if app_url:
                listed_pub_id, tl_rel_type = get_app_ads_text(app_url)

            outrow = [name, orig_pub_id, bundle, app_os, app_url,
                      listed_pub_id, tl_rel_type]
            cwriter.writerow(outrow)
            if listed_pub_id:
                print(outrow)
                print('Woohoo! Found TL in app-ads.txt!\n')
            crawled += 1
    print(f'Finished crawling {crawled} records')


if __name__ == '__main__':
    main()