def readAllLanguages(scone, lang_file):
    """Load the language dictionary as an RDD of ``(word, language)`` pairs.

    Every line of *lang_file* is split on commas. The first field is used as
    the language and the second as the word; both are stripped of
    surrounding whitespace and lower-cased. Lines that do not consist of
    exactly two non-empty fields are skipped.

    Args:
        scone: Spark context used to read the file (``textFile``).
        lang_file: Path of the comma-separated dictionary file.

    Returns:
        RDD of distinct ``(word, language)`` tuples, keyed by word so it can
        be joined against the words found in the texts.
    """
    def parse(line):
        # Strip each field: "german, der" must yield "der", not " der",
        # otherwise the entry can never match a token from the texts.
        return [field.strip().lower() for field in line.split(",")]

    return (
        scone.textFile(lang_file)
        .map(parse)
        .filter(lambda fields: len(fields) == 2 and all(fields))
        .map(lambda fields: (fields[1], fields[0]))
        # Lower-casing can collapse entries ("Der"/"der"); a duplicated pair
        # would be counted twice per file after the join.
        .distinct()
    )


def readAllTexts(scon, directory, partitions=1000):
    """Read the files in *directory* as an RDD of ``(file_name, content)``.

    Args:
        scon: Spark context used to read the files (``wholeTextFiles``).
        directory: Location of the text files.
        partitions: Passed to ``wholeTextFiles`` as ``minPartitions``.

    Returns:
        RDD of ``(base name of the file path, file content)`` tuples.
    """
    # Imported locally, as in the original, so the function is self-contained.
    import os

    files = scon.wholeTextFiles(directory, minPartitions=partitions)
    return files.map(lambda entry: (os.path.basename(entry[0]), entry[1]))


def clean_and_split(rdd):
    """Tokenize texts into lower-cased words.

    ASCII punctuation, newlines, tabs and carriage returns are replaced by
    spaces, then the text is split on any run of whitespace.

    Args:
        rdd: RDD of ``(file_name, content)`` tuples.

    Returns:
        RDD of ``(file_name, word)`` tuples, one per word occurrence.
    """
    from string import punctuation

    separators = punctuation + "\n\t\r"
    # One translation table -> a single pass over the text instead of one
    # str.replace() pass per separator character.
    to_spaces = str.maketrans(separators, " " * len(separators))

    def tokenize(record):
        name = record[0]
        # split() without arguments drops empty tokens and also treats other
        # whitespace (form feed, vertical tab, NBSP, ...) as a separator.
        return [(name, word.lower())
                for word in record[1].translate(to_spaces).split()]

    return rdd.flatMap(tokenize)


def unique_words_per_file(word_rdd):
    """Deduplicate ``(file_name, word)`` pairs and re-key them by word.

    Returns:
        RDD of ``(word, file_name)`` tuples, each pair occurring once.
    """
    return word_rdd.distinct().map(lambda pair: (pair[1], pair[0]))


def join_with_languages(word_file_rdd, lang_rdd):
    """Inner-join ``(word, file_name)`` with ``(word, language)`` on the word.

    Returns:
        RDD of ``(word, (file_name, language))`` tuples.
    """
    return word_file_rdd.join(lang_rdd)


def count_words_per_language(joined_rdd):
    """Count the joined records per ``(file_name, language)`` combination.

    Args:
        joined_rdd: RDD of ``(word, (file_name, language))`` tuples.

    Returns:
        RDD of ``((file_name, language), count)`` tuples.
    """
    from operator import add

    return (
        joined_rdd
        .map(lambda record: ((record[1][0], record[1][1]), 1))
        .reduceByKey(add)
    )


def detect_language_per_file(count_rdd):
    """Pick, for every file, the language with the highest word count.

    Ties are resolved in favour of the alphabetically smaller language name,
    so the result does not depend on the order in which Spark happens to
    combine the partial results.

    Args:
        count_rdd: RDD of ``((file_name, language), count)`` tuples.

    Returns:
        RDD of ``(file_name, language)`` tuples.
    """
    def best(left, right):
        # left / right are (language, count) candidates for the same file.
        if left[1] != right[1]:
            return left if left[1] > right[1] else right
        return left if left[0] <= right[0] else right

    return (
        count_rdd
        .map(lambda record: (record[0][0], (record[0][1], record[1])))
        .reduceByKey(best)
        .map(lambda record: (record[0], record[1][0]))
    )


def count_texts_per_language(lang_detected_rdd):
    """Count how many files were assigned to each language.

    Args:
        lang_detected_rdd: RDD of ``(file_name, language)`` tuples.

    Returns:
        RDD of ``(language, count)`` tuples sorted by descending count;
        languages with equal counts are ordered by name.
    """
    from operator import add

    return (
        lang_detected_rdd
        .map(lambda record: (record[1], 1))
        .reduceByKey(add)
        .sortBy(lambda record: (-record[1], record[0]))
    )


def detect_languages(scon, text_dir, lang_file, partitions=1000):
    """Detect the language of every text file and print a report.

    Runs the full pipeline (read dictionary and texts, tokenize, join,
    count, pick the best language per file), then prints the elapsed time,
    the detected language of up to 20 files and the number of texts per
    language.

    Example:
        detect_languages(scon, "/data/texte/txt", "/data/texte/languages.txt")

    Args:
        scon: Spark context.
        text_dir: Location of the text files.
        lang_file: Path of the comma-separated language dictionary.
        partitions: Minimum number of partitions used when reading the texts.
    """
    import time

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    started = time.perf_counter()

    lang_rdd = readAllLanguages(scon, lang_file)
    texts_rdd = readAllTexts(scon, text_dir, partitions=partitions)

    words_rdd = clean_and_split(texts_rdd)
    unique_rdd = unique_words_per_file(words_rdd)
    joined_rdd = join_with_languages(unique_rdd, lang_rdd)
    counts_rdd = count_words_per_language(joined_rdd)

    # Two actions are run on this RDD (directly and via the summary); cache
    # it so the second action can reuse the result of the first.
    detected_rdd = detect_language_per_file(counts_rdd).cache()
    summary_rdd = count_texts_per_language(detected_rdd)

    try:
        detected = detected_rdd.collect()
        summary = summary_rdd.collect()
    finally:
        detected_rdd.unpersist()

    duration = time.perf_counter() - started
    print(f"Abgeschlossen in {duration:.2f} Sekunden\n")

    print("Erkannte Sprachen pro Datei:")
    for f, lang in detected[:20]:
        print(f"{f}: {lang}")

    print("\nGesamtanzahl Texte pro Sprache:")
    for lang, count in summary:
        print(f"{lang}: {count}")