main.py
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType, IntegerType
from pyspark.sql.functions import from_unixtime, size, col, udf, explode, countDistinct
import datetime

filename = "wikiJSON.json"

# UDF returning the length of a value; only used by the commented-out experiments below
slen = udf(lambda s: len(s), IntegerType())

def createDataFrame(filename):
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    df = spark.read.load(filename, format="json")
    return df

# extract fields from the nested "revision" array column
def init_df(df):
    columns_to_drop = ['redirect', 'ns', 'revision', 'date']
    df = df.withColumn("revision", explode("revision")) \
        .select("*",
                # col("revision")["comment"].alias("comment"),
                col("revision")["contributor"]["username"].alias("author"),
                col("revision")["contributor"]["id"].alias("authorID"),
                col("revision")["timestamp"].alias("date")) \
        .withColumn('timestamp', from_unixtime('date', 'yyyy-MM-dd HH:mm:ss').cast(TimestampType()))
    # countdf = df.withColumn("revision", select('*', size('revision').alias('revision_length')))
    # countdf.show()
    df_res = df.drop(*columns_to_drop)
    return df_res

# total number of edits per author
def number_of_edits_per_author(df):
    print("Number of edits per author")
    authors = df.groupBy("author").count().orderBy("count", ascending=False)
    authors.show()

# number of distinct authors per article
def number_of_authors_per_article(df):
    df.groupBy("title").agg(countDistinct("author").alias("count")).show()

def convert_to_timestamp(date_text):
    return datetime.datetime.strptime(date_text, "%Y-%m-%d %H:%M:%S")

# number of edits per author between startDate and endDate
def number_of_authors_per_timestamp(df, startDate, endDate):
    startDate = convert_to_timestamp(startDate)
    endDate = convert_to_timestamp(endDate)
    df.filter((col("timestamp") >= startDate) & (col("timestamp") <= endDate)) \
        .groupBy("author").agg({"author": "count"}).show()

# minHashing (see the hedged MinHash LSH sketch at the end of this file)
df = createDataFrame(filename)
df_init = init_df(df)
df_init.show()
# number_of_authors_per_timestamp(df_init, '2001-01-21 03:00:00', '2001-06-03 23:00:00')
number_of_edits_per_author(df_init)
number_of_authors_per_article(df_init)
# df2 = df_init.withColumn("revision_length", slen(df.revision))
# df2.show()
'''
# get length of edit per article
print("Size of edit per article")
edit_len = df.select(size(df.revision)).collect()
print(edit_len)
# number of unique authors
#distinctAuthors = df.groupBy("author").agg(countDistinct("author"))
#distinctAuthors.show()
'''
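
# ---------------------------------------------------------------------------
# The "minHashing" marker above is not implemented in this script. The block
# below is a minimal, hedged sketch of how MinHash LSH could be applied to the
# per-author data with Spark ML (pyspark.ml.feature.HashingTF / MinHashLSH).
# It assumes each author's set of edited article titles is hashed into a sparse
# binary feature vector; the feature dimension (1 << 18), numHashTables=5 and
# the 0.6 Jaccard-distance threshold are illustrative assumptions, not values
# taken from this project.
'''
from pyspark.ml.feature import HashingTF, MinHashLSH
from pyspark.sql.functions import collect_set

# one row per author, with the set of article titles that author edited
author_articles = df_init.groupBy("author").agg(collect_set("title").alias("titles"))

# hash each title set into a sparse binary feature vector (assumed dimension)
hashing_tf = HashingTF(inputCol="titles", outputCol="features", numFeatures=1 << 18, binary=True)
featurized = hashing_tf.transform(author_articles)

# fit MinHash LSH and look for author pairs with similar edit histories
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(featurized)
similar = model.approxSimilarityJoin(featurized, featurized, 0.6, distCol="jaccardDist") \
    .filter(col("datasetA.author") < col("datasetB.author"))
similar.select("datasetA.author", "datasetB.author", "jaccardDist").show()
'''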