-
Notifications
You must be signed in to change notification settings - Fork 0
/
allTweets.py
23 lines (19 loc) · 845 Bytes
/
allTweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os
import sys
# Run the Spark Python workers and the driver with the interpreter that is
# executing this script, so both sides use the same Python installation.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

spark = SparkSession.builder.getOrCreate()

# Input: a single sample file of tweets in JSON form. An earlier configuration
# pointed at a glob over a whole directory instead:
#   "C:/Users/ArtzT/Documents/Visual Studio Code/Forschungsmodul/jsons/*.json"
jsonFiles = "D:/Uni/6.Semester/ForschungsmodullDatenbanken/Tweets/2014_soccer_champoinship/Samples/sample_2014-06-12-20-25-08.820+0200.json"
df = spark.read.json(jsonFiles)

# Keep the user's language and the lower-cased text of the retweeted status.
# The resulting columns are named "lang" and "Text".
# NOTE(review): "Text" is presumably null for tweets that are not retweets --
# confirm whether those rows should be filtered out before export.
referees = df.select(
    col("user.lang"),
    lower(col("retweeted_status.text")).alias("Text"),
)
# A per-text frequency count was sketched here earlier and left disabled:
#   .groupBy("text").count().sort("count", ascending=False)

# The output path is relative to the current working directory. Create the
# target directory up front so the export does not fail with OSError only
# after the Spark job and the collection to the driver have already run.
outputCsv = "Desktop/country_text_ALL.csv"
outputDir = os.path.dirname(outputCsv)
if outputDir:
    os.makedirs(outputDir, exist_ok=True)

# toPandas() collects every selected row onto the driver; to_csv() then writes
# them with the pandas index as the first column (the to_csv default).
referees.toPandas().to_csv(outputCsv)