2025-11-14 10:01:38 +01:00
6 changed files with 273 additions and 124 deletions


@@ -1,5 +1,7 @@
-def readAllLanguages(scone, lang_file):
-    return (scone.textFile(lang_file)
+from sparkstart import scon, spark
+
+def readAllLanguages(scon, lang_file):
+    return (scon.textFile(lang_file)
             .map(lambda line: line.strip().split(","))
             .filter(lambda x: len(x) == 2)
             .map(lambda x: (x[1].lower(), x[0].lower())))
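To make the pipeline above concrete, here is a minimal sketch that applies the same three transformations to an in-memory sample instead of an RDD; the "Name,code" layout of languages.txt is an assumption inferred from the lambdas, not something this commit confirms.

# Sketch only: mimic the readAllLanguages steps with plain Python lists (no cluster needed).
# The sample lines and the "Name,code" file format are assumptions.
sample = ["German,de", "English,en", "this line is malformed"]

step1 = [line.strip().split(",") for line in sample]       # split each line on the comma
step2 = [x for x in step1 if len(x) == 2]                   # drop rows without exactly two fields
step3 = [(x[1].lower(), x[0].lower()) for x in step2]       # swap the fields and lowercase both

print(step3)  # [('de', 'german'), ('en', 'english')]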
@@ -57,9 +59,6 @@ def count_texts_per_language(lang_detected_rdd):
 def detect_languages(scon, text_dir, lang_file, partitions=1000):
     """
     detect_languages(scon, "/data/texte/txt", "/data/texte/languages.txt")
     """
     import time
     start_time = time.time()
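This hunk only shows the timer being started; the matching report at the end of the function is not visible in this excerpt. A purely illustrative counterpart could look like this:

# Hypothetical closing counterpart to the timer above; not part of the shown diff.
elapsed = time.time() - start_time
print(f"detect_languages took {elapsed:.1f} s")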
@@ -89,4 +88,12 @@ def detect_languages(scon, text_dir, lang_file, partitions=1000):
print("\nGesamtanzahl Texte pro Sprache:")
for lang, count in summary:
print(f"{lang}: {count}")
print(f"{lang}: {count}")
+
+
+def main(scon, spark):
+    detect_languages(scon,
+                     text_dir="/data/texte/txt",
+                     lang_file="/data/texte/languages.txt",)
+
+
+if __name__ == "__main__":
+    main(scon, spark)
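Because sparkstart builds the SparkContext and SparkSession at import time, the new main() can also be driven from an interactive session. A minimal sketch, assuming the file above is importable as detect_languages (its path is not shown in this excerpt):

# Hypothetical interactive usage; the module name detect_languages is an assumption.
from sparkstart import scon, spark      # importing this eagerly creates the context and session
from detect_languages import main

main(scon, spark)                       # runs the full language-detection job on the cluster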

Aufgabe 7/sparkstart.py (new file, 22 lines added)

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
"""
Create a Spark configuration
"""
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
# connect to cluster
conf = SparkConf().setMaster("spark://193.174.205.250:7077").setAppName("HeisererValentin")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.executor.memory", '32g')
conf.set("spark.driver.memory", '8g')
conf.set("spark.cores.max", "40")
scon = SparkContext(conf=conf)

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL") \
    .getOrCreate()
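A minimal usage sketch for this module (the sample data is made up): because SparkSession.builder.getOrCreate() runs after SparkContext(conf=conf), the session attaches to the already-configured context instead of starting a second one.

# Usage sketch; the data below is illustrative only.
from sparkstart import scon, spark

rdd = scon.parallelize(["German,de", "English,en"])
print(rdd.count())                                  # -> 2

df = spark.createDataFrame([("de", 3), ("en", 5)], ["lang", "count"])
df.show()

scon.stop()                                         # release cluster resources when finished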