-
Notifications
You must be signed in to change notification settings - Fork 0
/
Instagram Query Program.py
56 lines (43 loc) · 2.74 KB
/
Instagram Query Program.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
# coding: utf-8
import os
import pandas as pd
import numpy as np
import re
# ---------------------------------------------------------------------------
# User settings: edit these before running.
#   max_followers / min_followers - exclusive bounds on the "followers" column
#   head - row number handed to read_csv as the header row
#   file - path of the CSV export to analyse
# ---------------------------------------------------------------------------
max_followers = 5000
min_followers = 100
head = 0
file = "replace_with_csv_file_path"

# Load the export and give its columns fixed names (whatever header text the
# file carries is overwritten).
df = pd.read_csv(file, header=head)
df.columns = ["name", "bio", "username", "followers"]

# Keep only accounts that have a bio and whose follower count lies strictly
# between the two bounds.
df = df[
    (df["followers"] < max_followers)
    & df["bio"].notna()
    & (df["followers"] > min_followers)
]
# Queries for high schools, universities, and years.
# Each query is a regex alternation matched case-insensitively against the bio.
# NOTE(review): Series.str.contains performs an unanchored regex search, so
# short alternatives such as 'ice', 'ic', 'gt', 'es', 'ms' or '22' also match
# inside longer words and numbers -- confirm that is intended before relying
# on these sheets.
hs_query = 'qa|nchs|ice|ahs|south division 21’|whitney 2020|ccat|carver|ascs|bdhs|kchs|cuyahoga falls|oak hill|high school|highschool|mshs|ajhs|battle mountain|avhs|eleanor roosevelt high'
# Fixed: the original filtered on the undefined name `query` (NameError).
highschool = df[df['bio'].str.contains(hs_query, flags=re.IGNORECASE)]

college_query = 'university|jhu|college|john hopkins|stanford|mit|cmu|university of maryland|goucher|Frostburg|gt|purdue|florida state|jmu|Dsu|morgan state|upenn|penn state|hopkins|ic|isu|ksu|lmu|mcla|2021🎓|niu|occ|rwu|full sail|tamu|uchicago|U of C|virginia tech|xavier|yale|rit|george mason|temple|florida tech|cal poly|C’O 19|berkeley'
university = df[df['bio'].str.contains(college_query, flags=re.IGNORECASE)]

# NOTE(review): the c"\\"o... alternatives match the literal text c""o...
# (two double quotes); presumably "c/o" was meant -- confirm.
year_query = 'c"\\"o2021|\'21|‘21|co21|c"\\"o21|c"\\"o2020|\'20|co20|‘20|c"\\"o20|c"\\"o2019|‘19|\'19|co19|c"\\"o19|c"\\"o2018|‘18|\'18|co18|c"\\"o18|c"\\"o2017|\'17|‘17|co17|c"\\"o17'
year = df[df['bio'].str.contains(year_query, flags=re.IGNORECASE)]

private_query = 'sspp'
private = df[df['bio'].str.contains(private_query, flags=re.IGNORECASE)]

# Union of every category. DataFrame.append was removed in pandas 2.0, so the
# frames are stacked with pd.concat (original row labels are preserved); an
# account matched by several queries is then collapsed to a single row.
all_valid_students = pd.concat([year, university, highschool, private])
all_valid_students = all_valid_students.drop_duplicates()

# High-school or class-year matches, minus any bio that also matches one of
# the delete_query alternatives (later class years, elementary/middle school).
public_students = pd.concat([highschool, year])
delete_query = 'c"\\"o2022|‘22|22|22\'|22”|\'22|2022|co22|22\"|c"\\"o22|c"\\"o2023|23”|23\'|‘23|23|\'23|co23|2023|c"\\"o23|23\"|c"\\"o2024|‘24|24\'|24|24”|\'24|co24|2024|c"\\"o24|es|elementary school|elementary|ms|middle school'
public_students = public_students[~public_students['bio'].str.contains(delete_query, flags=re.IGNORECASE)]
public_students = public_students.drop_duplicates()

# Accounts that matched no category. Rows are excluded by index label instead
# of the original cell-masking trick (df[~df.isin(...)]), which turned the
# "followers" column into floats. dropna() is kept from the original, so rows
# with any missing field are still left out of this frame.
delete = df.loc[~df.index.isin(all_valid_students.index)].dropna()
# Write every result frame to its own sheet of a single workbook,
# "instagram query/output.xlsx", relative to the current working directory.
directory = 'instagram query'
file = 'output.xlsx'
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, replacing the separate exists-then-create check.
os.makedirs(directory, exist_ok=True)
path = os.path.join(directory, file)
# The context manager saves and closes the workbook when the block exits.
with pd.ExcelWriter(path) as writer:
    delete.to_excel(writer, sheet_name='delete', index=False, header=True)
    private.to_excel(writer, sheet_name='private', index=False, header=True)
    public_students.to_excel(writer, sheet_name='public_students', index=False, header=True)
    university.to_excel(writer, sheet_name='university', index=False, header=True)
    all_valid_students.to_excel(writer, sheet_name='all_valid', index=False, header=True)