mirror of
https://github.com/Vale54321/BigData.git
synced 2025-12-11 09:59:33 +01:00
Merge branch 'main' of https://github.com/Vale54321/BigData
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
def readAllLanguages(scone, lang_file):
|
||||
return (scone.textFile(lang_file)
|
||||
from sparkstart import scon, spark
|
||||
|
||||
def readAllLanguages(scon, lang_file):
    """Load the language table as an RDD of ``(second, first)`` column pairs.

    Every line of ``lang_file`` is split on commas. Lines that do not yield
    exactly two fields are dropped. Both fields are trimmed of surrounding
    whitespace and lower-cased, and the pair is emitted with the columns
    swapped (second column first).

    Args:
        scon: Object providing ``textFile(path)``; in this file it is the
            ``scon`` imported from ``sparkstart``.
        lang_file: Path of the comma-separated language file.

    Returns:
        The result of the ``map``/``filter`` chain applied to
        ``scon.textFile(lang_file)``.
    """
    return (scon.textFile(lang_file)
            # Trim each field individually so "Deutsch, DE" does not keep a
            # leading blank in the second column.
            .map(lambda line: [field.strip() for field in line.split(",")])
            .filter(lambda fields: len(fields) == 2)
            .map(lambda fields: (fields[1].lower(), fields[0].lower())))
|
||||
@@ -57,9 +59,6 @@ def count_texts_per_language(lang_detected_rdd):
|
||||
|
||||
|
||||
def detect_languages(scon, text_dir, lang_file, partitions=1000):
|
||||
"""
|
||||
detect_languages(scon, "/data/texte/txt", "/data/texte/languages.txt")
|
||||
"""
|
||||
import time
|
||||
|
||||
start_time = time.time()
|
||||
@@ -89,4 +88,12 @@ def detect_languages(scon, text_dir, lang_file, partitions=1000):
|
||||
|
||||
print("\nGesamtanzahl Texte pro Sprache:")
|
||||
for lang, count in summary:
|
||||
print(f"{lang}: {count}")
|
||||
print(f"{lang}: {count}")
|
||||
|
||||
def main(scon, spark):
    """Run ``detect_languages`` on the hard-coded corpus paths.

    Args:
        scon: Forwarded as the first argument of ``detect_languages``.
        spark: Accepted for the caller's convenience; not used here.
    """
    corpus_paths = {
        "text_dir": "/data/texte/txt",
        "lang_file": "/data/texte/languages.txt",
    }
    detect_languages(scon, **corpus_paths)


# Only start the job when this file is executed as a script.
if __name__ == "__main__":
    main(scon, spark)
|
||||
22
Aufgabe 7/sparkstart.py
Normal file
22
Aufgabe 7/sparkstart.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# -*- coding: utf-8 -*-

"""
Create a Spark configuration and expose the module-level ``scon``
(SparkContext) and ``spark`` (SparkSession) objects built from it.
"""

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# connect to cluster
conf = (
    SparkConf()
    .setMaster("spark://193.174.205.250:7077")
    .setAppName("HeisererValentin")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .set("spark.executor.memory", "32g")
    .set("spark.driver.memory", "8g")
    .set("spark.cores.max", "40")
)
scon = SparkContext(conf=conf)

# The session is requested after the context has been created above.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL")
    .getOrCreate()
)
|
||||
Reference in New Issue
Block a user