From 2f1e9c5b93ddf06e1cd498753a2c4dd780837f6e Mon Sep 17 00:00:00 2001
From: Valentin Heiserer
Date: Fri, 31 Oct 2025 11:49:16 +0100
Subject: [PATCH] Aufgabe 4

---
 Aufgabe 4/main.py       | 95 +++++++++++++++++++++++++++++++++++++++++
 Aufgabe 4/sparkstart.py | 30 ++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 Aufgabe 4/main.py
 create mode 100644 Aufgabe 4/sparkstart.py

diff --git a/Aufgabe 4/main.py b/Aufgabe 4/main.py
new file mode 100644
index 0000000..392bcfa
--- /dev/null
+++ b/Aufgabe 4/main.py
@@ -0,0 +1,95 @@
+from sparkstart import scon
+import matplotlib.pyplot as plt
+
+#--------------------------------Aufgabe a----------------------------------
+print("\nAufgabe a)")
+rd1 = scon.parallelize(range(1, 101))   # 1..100
+rd2 = scon.parallelize(range(50, 151))  # 50..150
+
+print(rd1.takeOrdered(22))
+print(rd2.takeOrdered(101))
+
+# Print the results of the actions instead of discarding them
+print("rd1.count():", rd1.count())
+print("rd2.count():", rd2.count())
+
+print("rd1.first():", rd1.first())
+print("rd2.first():", rd2.first())
+
+print("rd1.takeSample:", rd1.takeSample(False, 10))
+print("rd2.takeSample:", rd2.takeSample(False, 10))
+
+
+#--------------------------------Aufgabe b----------------------------------
+print("\nAufgabe b)")
+# Elements of rd1 divisible by both 5 and 7, i.e. by 35
+rd1_div_5_and_7 = rd1.filter(lambda x: x % 5 == 0 and x % 7 == 0)
+print(rd1_div_5_and_7.takeOrdered(100))
+
+
+#--------------------------------Aufgabe c----------------------------------
+print("\nAufgabe c)")
+intersection = rd1.intersection(rd2).sortBy(lambda x: x)
+union_set = rd1.union(rd2).distinct().sortBy(lambda x: x)
+diff_rd1_minus_rd2 = rd1.subtract(rd2).sortBy(lambda x: x)
+diff_rd2_minus_rd1 = rd2.subtract(rd1).sortBy(lambda x: x)
+cartesian_product = rd1.cartesian(rd2).sortBy(lambda p: (p[0], p[1]))
+
+print("\nIntersection:")
+intersection_vals = intersection.collect()
+print(intersection_vals)
+print("Count:", len(intersection_vals))
+
+print("\nUnion:")
+union_vals = union_set.collect()
+print(union_vals)
+print("Count:", len(union_vals))
+
+print("\nDifference rd1 - rd2:")
+diff1_vals = diff_rd1_minus_rd2.collect()
+print(diff1_vals)
+print("Count:", len(diff1_vals))
+
+print("\nDifference rd2 - rd1:")
+diff2_vals = diff_rd2_minus_rd1.collect()
+print(diff2_vals)
+print("Count:", len(diff2_vals))
+
+print("\nCartesian product:")  # 100 * 101 = 10100 pairs
+cart_vals = cartesian_product.collect()
+print(cart_vals)
+print("Count:", len(cart_vals))
+
+
+#--------------------------------Aufgabe d----------------------------------
+print("\nAufgabe d)")
+rd3 = rd1.map(lambda x: (x, 1.0 / x))
+sum_rd3 = rd3.map(lambda kv: kv[1]).sum()
+print("Sum:", sum_rd3)
+
+rd3_sorted = rd3.sortByKey().collect()
+xs = [x for x, _ in rd3_sorted]
+ys = [y for _, y in rd3_sorted]
+
+plt.figure(figsize=(8, 4))
+plt.plot(xs, ys, marker='o', linestyle='-', color='tab:blue')
+plt.title('1/x over x (rd1)')
+plt.xlabel('x')
+plt.ylabel('1/x')
+plt.grid(True, alpha=0.3)
+
+plt.tight_layout()
+plt.savefig('aufgabe_d.png')  # write the figure to a file; close() alone would discard it
+plt.close()
+
+#--------------------------------Aufgabe e----------------------------------
+print("\nAufgabe e)")
+# Product of all elements of rd2 (= 150!/49!); 1 is the neutral element of *
+product_rd2 = rd2.fold(1, lambda a, b: a * b)
+print(product_rd2)
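+
+# Cross-check (a sketch, not required by the exercise): reduce() must yield the
+# same product, since fold's zero value 1 is the multiplicative identity.
+assert product_rd2 == rd2.reduce(lambda a, b: a * b)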
+
+scon.stop()
diff --git a/Aufgabe 4/sparkstart.py b/Aufgabe 4/sparkstart.py
new file mode 100644
index 0000000..bdb7010
--- /dev/null
+++ b/Aufgabe 4/sparkstart.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+
+"""
+Create the Spark configuration, SparkContext and SparkSession for Aufgabe 4.
+"""
+
+from pyspark import SparkConf, SparkContext
+from pyspark.sql import SparkSession
+
+# connect to the cluster
+conf = SparkConf().setMaster("spark://193.174.205.250:7077").setAppName("HeisererValentin")
+conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+conf.set("spark.executor.memory", '32g')
+# Note: spark.driver.memory normally only takes effect when set before the
+# driver JVM starts (e.g. via spark-submit), not from within the application.
+conf.set("spark.driver.memory", '8g')
+conf.set("spark.cores.max", "40")
+scon = SparkContext(conf=conf)
+
+# getOrCreate() reuses the SparkContext started above, so the appName set on
+# conf takes precedence over the one passed to the builder.
+spark = SparkSession \
+    .builder \
+    .appName("Python Spark SQL") \
+    .getOrCreate()
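+
+# Quick sanity check (a sketch; assumes getOrCreate() attached to the context
+# created above instead of starting a new one):
+print("Spark master:", scon.master)
+assert spark.sparkContext is scon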