-
Notifications
You must be signed in to change notification settings - Fork 0
/
wmdatenauswerten.py
43 lines (31 loc) · 1.37 KB
/
wmdatenauswerten.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
import pyspark.sql.functions as func
import os
import sys
from pyspark.sql import SparkSession
import pyspark
# SSH setup notes: copy the key with OpenSSH, generate a new public key,
# and append the new key to .ssh/authorized_keys.

# Environment: point both the Spark workers and the driver at the Python
# interpreter that is running this script, so they cannot pick up a
# different installation.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Reuse a running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
# Obtain the session through the documented builder entry point instead of
# calling the SparkSession constructor directly; the builder reuses the
# context created above.
spark = SparkSession.builder.getOrCreate()
# Input paths
# Glob pattern matching every .json file in the input directory.
pathsample_1 = "C:/Users/ArtzT/Documents/WordPad Dokumente/Universität/INHALT STUDIENGANG INFORMATIK/6 Semester/SOES/Übung/uebung07/input/*.json"
# A single JSON sample file (only used by the disabled analysis below).
pathsample_2 = "C:/Users/ArtzT/Documents/Visual Studio Code/Forschungsmodul/jsons/jsons/sample_2014-06-12-18-58-38.348+0200.json"

# Countries/time zones that were most engaged: number of tweets per
# user.time_zone, most frequent first.
# Disabled: the code sits inside a bare string literal, so it is not executed.
"""
laender = spark.read.json(pathsample_2)
laender.createTempView("Zeitzonen")
laender_ausgabe = spark.sql("SELECT user.time_zone, count(user.time_zone) FROM Zeitzonen GROUP BY user.time_zone ORDER BY count(user.time_zone) DESC")
laender_ausgabe.show(20, False)
"""

# Referee mentions
referee_tweets = spark.read.json(pathsample_1)
# createOrReplaceTempView instead of createTempView: the latter raises when
# the view name is already registered, which happens as soon as this script
# is run a second time in a session that getOrCreate() reused.
referee_tweets.createOrReplaceTempView("Schiedsrichtererwaehnungen")
# NOTE: the filter is on retweeted_status.text, so tweets that are not
# retweets never match, and LIKE is case-sensitive ('Referee' is not found).
referee = spark.sql("SELECT user.name, retweeted_status.text FROM Schiedsrichtererwaehnungen WHERE retweeted_status.text LIKE '%referee%'")
# Print up to 200 rows without truncating long column values.
referee.show(200, False)