# readData.py
import os
import sys

import pyspark.sql.functions as func
from pyspark.sql import SparkSession
# Copy the key over with OpenSSH,
# generate a new public key,
# and append the new key to .ssh/authorized_keys.
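# A minimal sketch of those steps with standard OpenSSH tooling (user@host
# is a placeholder, not taken from this project):
#   ssh-keygen -t ed25519     # generate a new key pair
#   ssh-copy-id user@host     # append the public key to ~/.ssh/authorized_keys on the server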
# Environment: make the Spark workers use the same Python interpreter as the driver.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
spark = SparkSession.builder.getOrCreate()
path5_100 = "E:/Uni/SoSe21/Forschungsmodull Datenbanken/Code/rt-5-100.json"
path10_100 = "E:/Uni/SoSe21/Forschungsmodull Datenbanken/Code/rt-10-100.json"
path10 = "E:/Uni/SoSe21/Forschungsmodull Datenbanken/Code/rt-10.json"
path5 = "E:/Uni/SoSe21/Forschungsmodull Datenbanken/Code/rt-5.json"
path1 = "E:/Uni/SoSe21/Forschungsmodull Datenbanken/Code/rt-1.json"
pathsample_2 = "E:/Uni/SoSe21/Forschungsmodull Datenbanken/Code/Tweets/sample_2014-06-24-03-53-52.189+0200.json"
# Load each tweet data set as a DataFrame; spark.read.json infers the schema.
rt_1 = spark.read.json(path1)
rt_5 = spark.read.json(path5)
rt_10 = spark.read.json(path10)
rt_10_100 = spark.read.json(path10_100)
rt_5_100 = spark.read.json(path5_100)
sample2 = spark.read.json(pathsample_2)
#tweetsDF.printSchema()
# Create a new temporary view and print it with the show method.
#tweetsDF.createOrReplaceTempView("tweets")
#twitternickname = spark.sql("SELECT text FROM tweets WHERE contributors is not NULL")
#twitternickname.show()
"""
rt_1.createOrReplaceTempView("Follower")
famous1 = spark.sql("SELECT id, user.name, user.time_zone FROM Follower WHERE user.followers_count >= 1000000")
famous1.show(10, False)
rt_5.createOrReplaceTempView("Follower")
famous2 = spark.sql("SELECT id, user.name, user.time_zone FROM Follower WHERE user.followers_count >= 1000000")
famous2.show(10, False)
rt_10.createOrReplaceTempView("Follower")
famous3 = spark.sql("SELECT id, user.name, user.time_zone FROM Follower WHERE user.followers_count >= 1000000")
famous3.show(10, False)
rt_10_100.createOrReplaceTempView("Follower")
famous4 = spark.sql("SELECT id, user.name, user.time_zone FROM Follower WHERE user.followers_count >= 1000000")
famous4.show(10, False)
"""
# Accounts with at least one million followers, most-followed first.
sample2.createOrReplaceTempView("Follower")
famous5 = spark.sql("SELECT id, user.name, text, user.followers_count FROM Follower WHERE user.followers_count >= 1000000 ORDER BY user.followers_count DESC")
famous5.show(100, False)
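# The same query expressed with the DataFrame API instead of SQL (a sketch;
# famous5_df is a name introduced here, and both forms produce the same plan):
famous5_df = (sample2
    .where(func.col("user.followers_count") >= 1000000)
    .select("id", "user.name", "text", "user.followers_count")
    .orderBy(func.desc("followers_count")))
famous5_df.show(100, False)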
"""
rt_1.createOrReplaceTempView("TweetsofFamousAccount")
tweets = spark.sql("SELECT text FROM TweetsofFamousAccount WHERE id = 231658792209743872")
tweets.show(20, False)
rt_5.createOrReplaceTempView("TweetsofFamousAccount")
tweets2 = spark.sql("SELECT text FROM TweetsofFamousAccount WHERE id = 231658792209743872")
tweets2.show(20, False)
rt_10.createOrReplaceTempView("TweetsofFamousAccount")
tweets3 = spark.sql("SELECT text FROM TweetsofFamousAccount WHERE id = 231658792209743872")
tweets3.show(20, False)
"""
# Use only a single row at first, then keep experimenting.
# jqube
# In the end only a few attributes will be relevant.
# Keep an eye on the log.
# Use smaller data sets first.
# Spark needs Java 8 or 11.
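# A sketch of the "smaller data sets first" note above; limit() and sample()
# are standard DataFrame methods, and the fraction and seed are arbitrary choices:
small = sample2.limit(100)                      # first 100 rows only
tiny = sample2.sample(fraction=0.01, seed=42)   # ~1% random sample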