mirror of https://github.com/Vale54321/BigData.git
synced 2025-12-11 09:59:33 +01:00
add Aufgabe 6-8
Aufgabe 7/main.py (new file, 92 lines added)
@@ -0,0 +1,92 @@
def readAllLanguages(scon, lang_file):
    # Read the language word list and key it by the lower-cased word,
    # yielding (word, language) pairs for the later join.
    return (scon.textFile(lang_file)
            .map(lambda line: line.strip().split(","))
            .filter(lambda x: len(x) == 2)
            .map(lambda x: (x[1].lower(), x[0].lower())))
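
# Expected format of the language file, inferred from the parsing above
# (one "language,word" pair per line). The entries below are illustrative
# examples, not taken from the repository:
#
#   english,the
#   english,and
#   german,und
#   german,der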

def readAllTexts(scon, directory, partitions=1000):
    import os

    # Load every file under the directory as (path, content) pairs
    # and strip each path down to its base file name.
    rdd = scon.wholeTextFiles(directory, minPartitions=partitions)
    rdd = rdd.map(lambda x: (os.path.basename(x[0]), x[1]))
    return rdd


def clean_and_split(rdd):
    from string import punctuation

    def clean_text(text):
        # Replace punctuation and whitespace control characters with spaces.
        for ch in punctuation + "\n\t\r":
            text = text.replace(ch, " ")
        return text

    # Emit one (filename, word) pair per non-empty, lower-cased word.
    return rdd.flatMap(lambda x: [(x[0], w.lower())
                                  for w in clean_text(x[1]).split(" ")
                                  if w.strip() != ""])


def unique_words_per_file(word_rdd):
    # Deduplicate (filename, word) pairs and re-key them by word
    # so they can be joined against the language list.
    return (word_rdd.distinct()
            .map(lambda x: (x[1], x[0])))


def join_with_languages(word_file_rdd, lang_rdd):
    # (word, filename) joined with (word, language)
    # yields (word, (filename, language)).
    return word_file_rdd.join(lang_rdd)


def count_words_per_language(joined_rdd):
    # Count the distinct words of each language occurring in each file.
    return (joined_rdd
            .map(lambda x: ((x[1][0], x[1][1]), 1))
            .reduceByKey(lambda a, b: a + b))


def detect_language_per_file(count_rdd):
    # Keep, per file, the language with the highest word count
    # (ties are broken arbitrarily by reduce order).
    return (count_rdd
            .map(lambda x: (x[0][0], (x[0][1], x[1])))
            .reduceByKey(lambda a, b: a if a[1] > b[1] else b)
            .map(lambda x: (x[0], x[1][0])))


def count_texts_per_language(lang_detected_rdd):
    # Count the detected files per language, most frequent first.
    return (lang_detected_rdd
            .map(lambda x: (x[1], 1))
            .reduceByKey(lambda a, b: a + b)
            .sortBy(lambda x: -x[1]))


def detect_languages(scon, text_dir, lang_file, partitions=1000):
    """
    detect_languages(scon, "/data/texte/txt", "/data/texte/languages.txt")
    """
    import time

    start_time = time.time()

    lang_rdd = readAllLanguages(scon, lang_file)
    texts_rdd = readAllTexts(scon, text_dir, partitions=partitions)

    words_rdd = clean_and_split(texts_rdd)
    unique_rdd = unique_words_per_file(words_rdd)
    joined_rdd = join_with_languages(unique_rdd, lang_rdd)
    counts_rdd = count_words_per_language(joined_rdd)
    detected_rdd = detect_language_per_file(counts_rdd)
    summary_rdd = count_texts_per_language(detected_rdd)

    # collect() triggers the actual computation of both pipelines.
    detected = detected_rdd.collect()
    summary = summary_rdd.collect()

    end_time = time.time()
    duration = end_time - start_time

    print(f"Finished in {duration:.2f} seconds\n")

    print("Detected language per file:")
    for f, lang in detected[:20]:
        print(f"{f}: {lang}")

    print("\nTotal number of texts per language:")
    for lang, count in summary:
        print(f"{lang}: {count}")
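
The file defines no entry point, so detect_languages has to be invoked from a Spark shell or a separate driver script. A minimal driver sketch follows; it is not part of the commit and assumes a local PySpark installation, that main.py is importable from the working directory, and the example paths from the docstring.

from pyspark import SparkConf, SparkContext

# Hypothetical driver, not part of the commit. "Aufgabe 7" contains a
# space, so run this from inside that directory (or adjust sys.path).
from main import detect_languages

if __name__ == "__main__":
    conf = SparkConf().setAppName("language-detection").setMaster("local[*]")
    scon = SparkContext(conf=conf)
    try:
        detect_languages(scon,
                         "/data/texte/txt",
                         "/data/texte/languages.txt",
                         partitions=1000)
    finally:
        scon.stop()  # release the Spark context even if the job fails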