add Aufgabe 6-8
Aufgabe 8/main.py (new file, 220 lines)
@@ -0,0 +1,220 @@
from sparkstart import scon, spark
import ghcnd_stations
import matplotlib.pyplot as plt
import time

# a) List of all stations, sorted by station name
def get_all_stations():
    start = time.time()
    result = spark.sql("SELECT * FROM stations ORDER BY name")
    result.show()
    end = time.time()
    print(f"Time: {end - start}")

    # Second run (for timing comparison)
    start = time.time()
    result = spark.sql("SELECT * FROM stations ORDER BY name")
    result.show()
    end = time.time()
    print(f"Time, second run: {end - start}")

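# The run-twice-and-time pattern above repeats in b) and c) below. A possible shared
# helper (only a sketch; the exercise functions in this file keep their explicit,
# duplicated form):
def run_timed(query, runs=2):
    """Run a Spark SQL query `runs` times and print the elapsed time of each run."""
    result = None
    for i in range(1, runs + 1):
        start = time.time()
        result = spark.sql(query)
        result.show()
        print(f"Time, run {i}: {time.time() - start}")
    return result
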
# b) Number of stations per country
def get_station_count_per_country():
    start = time.time()
    result = spark.sql("""
        SELECT c.country_code, c.name, COUNT(s.id) as count
        FROM stations s
        JOIN ghcndcountries c ON s.country = c.country_code
        GROUP BY c.country_code, c.name
        ORDER BY count DESC
    """)
    result.show(truncate=False)
    end = time.time()
    print(f"Time: {end - start}")

    # Second run
    start = time.time()
    result = spark.sql("""
        SELECT c.country_code, c.name, COUNT(s.id) as count
        FROM stations s
        JOIN ghcndcountries c ON s.country = c.country_code
        GROUP BY c.country_code, c.name
        ORDER BY count DESC
    """)
    result.show(truncate=False)
    end = time.time()
    print(f"Time, second run: {end - start}")

# c) Stations in Germany
def get_german_stations():
    start = time.time()
    result = spark.sql("SELECT * FROM stations WHERE country = 'GM' ORDER BY name")
    result.show()
    end = time.time()
    print(f"Time: {end - start}")

    # Second run
    start = time.time()
    result = spark.sql("SELECT * FROM stations WHERE country = 'GM' ORDER BY name")
    result.show()
    end = time.time()
    print(f"Time, second run: {end - start}")

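# The second runs in a) to c) are usually faster because Spark has already read the
# station data once. A sketch of making that explicit with Spark SQL's CACHE TABLE /
# UNCACHE TABLE commands (assumes the table is registered as 'stations', as above;
# not called from main):
def cache_stations():
    spark.sql("CACHE TABLE stations")    # materialize the table in memory

def uncache_stations():
    spark.sql("UNCACHE TABLE stations")  # release the cached data again
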
# d) Plot TMAX and TMIN for a given station and year
def plot_temp_day(station_name, year):
    # Look up the station ID
    station_id = spark.sql(f"SELECT id FROM stations WHERE name = '{station_name}'").collect()[0][0]

    # Filter the data for this station and year
    df_filtered = spark.sql(f"""
        SELECT date, TMAX, TMIN FROM ghcnd_data
        WHERE station = '{station_id}' AND year(date) = {year}
        ORDER BY date
    """).toPandas()

    # Convert from tenths of a degree to degrees Celsius
    df_filtered['TMAX'] /= 10
    df_filtered['TMIN'] /= 10

    # Day of the year for the x-axis
    df_filtered['day_of_year'] = df_filtered['date'].dt.dayofyear

    plt.plot(df_filtered['day_of_year'], df_filtered['TMAX'], 'r', label='TMAX')
    plt.plot(df_filtered['day_of_year'], df_filtered['TMIN'], 'b', label='TMIN')
    plt.xlabel('Day of year')
    plt.ylabel('Temperature (°C)')
    plt.title(f'{station_name} {year}')
    plt.legend()
    plt.show()

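# Every plotting function repeats the station-ID lookup via an f-string query. A shared
# helper (only a sketch; the functions in this file keep their own lookups) that also
# escapes single quotes in the station name:
def get_station_id(station_name):
    """Return the ID of the first station whose name matches exactly."""
    escaped = station_name.replace("'", "''")
    rows = spark.sql(f"SELECT id FROM stations WHERE name = '{escaped}'").collect()
    if not rows:
        raise ValueError(f"No station named {station_name!r}")
    return rows[0][0]
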
# e) Total precipitation per year for a station
def plot_precip_year(station_name):
    station_id = spark.sql(f"SELECT id FROM stations WHERE name = '{station_name}'").collect()[0][0]

    df_precip = spark.sql(f"""
        SELECT year(date) as year, SUM(PRCP)/10 as total_precip
        FROM ghcnd_data
        WHERE station = '{station_id}'
        GROUP BY year(date)
        ORDER BY year
    """).toPandas()

    plt.bar(df_precip['year'], df_precip['total_precip'])
    plt.xlabel('Year')
    plt.ylabel('Precipitation (mm)')
    plt.title(f'Total precipitation {station_name}')
    plt.show()

# f) Average TMAX per day of the year, with a 21-day rolling mean
def plot_avg_tmax_day(station_name):
    station_id = spark.sql(f"SELECT id FROM stations WHERE name = '{station_name}'").collect()[0][0]

    df_avg = spark.sql(f"""
        SELECT dayofyear(date) as day, AVG(TMAX)/10 as avg_tmax
        FROM ghcnd_data
        WHERE station = '{station_id}'
        GROUP BY dayofyear(date)
        ORDER BY day
    """).toPandas()

    # Centered 21-day rolling mean
    df_avg['rolling_avg'] = df_avg['avg_tmax'].rolling(21, center=True).mean()

    plt.plot(df_avg['day'], df_avg['avg_tmax'], label='Daily')
    plt.plot(df_avg['day'], df_avg['rolling_avg'], label='21-day mean')
    plt.xlabel('Day of year')
    plt.ylabel('Average TMAX (°C)')
    plt.title(f'Average TMAX {station_name}')
    plt.legend()
    plt.show()

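# The 21-day smoothing above is done in pandas after collecting the result. The same
# centered rolling mean could also be computed in Spark SQL with a window frame
# (a sketch; unlike pandas' default, the frame simply shrinks at the edges of the year
# instead of producing NaN; not called from main):
def avg_tmax_day_smoothed_sql(station_id):
    return spark.sql(f"""
        SELECT day, avg_tmax,
               AVG(avg_tmax) OVER (ORDER BY day
                                   ROWS BETWEEN 10 PRECEDING AND 10 FOLLOWING) AS rolling_avg
        FROM (
            SELECT dayofyear(date) AS day, AVG(TMAX)/10 AS avg_tmax
            FROM ghcnd_data
            WHERE station = '{station_id}'
            GROUP BY dayofyear(date)
        ) AS per_day
        ORDER BY day
    """)
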
# g) Average TMAX and TMIN per year for a station
def plot_temp_year(station_name):
    station_id = spark.sql(f"SELECT id FROM stations WHERE name = '{station_name}'").collect()[0][0]

    df_temp = spark.sql(f"""
        SELECT year(date) as year, AVG(TMAX)/10 as avg_tmax, AVG(TMIN)/10 as avg_tmin
        FROM ghcnd_data
        WHERE station = '{station_id}'
        GROUP BY year(date)
        ORDER BY year
    """).toPandas()

    plt.plot(df_temp['year'], df_temp['avg_tmax'], 'r', label='TMAX')
    plt.plot(df_temp['year'], df_temp['avg_tmin'], 'b', label='TMIN')
    plt.xlabel('Year')
    plt.ylabel('Temperature (°C)')
    plt.title(f'Temperature {station_name}')
    plt.legend()
    plt.show()

# h) Average TMAX per year with a 20-year rolling mean
def plot_tmax_trend(station_name):
    station_id = spark.sql(f"SELECT id FROM stations WHERE name = '{station_name}'").collect()[0][0]

    df_trend = spark.sql(f"""
        SELECT year(date) as year, AVG(TMAX)/10 as avg_tmax
        FROM ghcnd_data
        WHERE station = '{station_id}'
        GROUP BY year(date)
        ORDER BY year
    """).toPandas()

    # Centered 20-year rolling mean
    df_trend['rolling_avg'] = df_trend['avg_tmax'].rolling(20, center=True).mean()

    plt.plot(df_trend['year'], df_trend['avg_tmax'], label='Yearly')
    plt.plot(df_trend['year'], df_trend['rolling_avg'], label='20-year mean')
    plt.xlabel('Year')
    plt.ylabel('Average TMAX (°C)')
    plt.title(f'TMAX trend {station_name}')
    plt.legend()
    plt.show()

# i) Correlation of TMIN and TMAX per year
def plot_corr_temp(station_name):
    station_id = spark.sql(f"SELECT id FROM stations WHERE name = '{station_name}'").collect()[0][0]

    df_corr = spark.sql(f"""
        SELECT year(date) as year, corr(TMIN, TMAX) as correlation
        FROM (
            SELECT date, TMIN, TMAX
            FROM ghcnd_data
            WHERE station = '{station_id}' AND TMIN IS NOT NULL AND TMAX IS NOT NULL
        )
        GROUP BY year(date)
        ORDER BY year
    """).toPandas()

    plt.plot(df_corr['year'], df_corr['correlation'])
    plt.xlabel('Year')
    plt.ylabel('Correlation TMIN-TMAX')
    plt.title(f'Correlation {station_name}')
    plt.show()

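# A small cross-check for i): the same per-year correlation computed in pandas after
# collecting only the rows for one station (a sketch; fine for a single station, not
# meant for the full dataset; not called from main):
def corr_temp_pandas(station_id):
    df = spark.sql(f"""
        SELECT year(date) as year, TMIN, TMAX
        FROM ghcnd_data
        WHERE station = '{station_id}' AND TMIN IS NOT NULL AND TMAX IS NOT NULL
    """).toPandas()
    return df.groupby('year').apply(lambda g: g['TMIN'].corr(g['TMAX']))
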
def main(scon, spark):
    # Load the data
    ghcnd_stations.read_ghcnd_from_parquet(spark)

    # a) List of all stations
    get_all_stations()

    # b) Number of stations per country
    get_station_count_per_country()

    # c) Stations in Germany
    get_german_stations()

    # d) Plots for Kempten, Hohenpeissenberg, Zugspitze
    plot_temp_day('KEMPTEN', 2020)
    plot_temp_day('HOHENPEISSENBERG', 2020)
    plot_temp_day('ZUGSPITZE', 2020)

    # e) Precipitation
    plot_precip_year('KEMPTEN')
    plot_precip_year('HOHENPEISSENBERG')
    plot_precip_year('ZUGSPITZE')

    # f) Average TMAX
    plot_avg_tmax_day('KEMPTEN')
    plot_avg_tmax_day('HOHENPEISSENBERG')
    plot_avg_tmax_day('ZUGSPITZE')

    # g) Temperature per year
    plot_temp_year('KEMPTEN')
    plot_temp_year('HOHENPEISSENBERG')
    plot_temp_year('ZUGSPITZE')

    # h) TMAX trend
    plot_tmax_trend('KEMPTEN')
    plot_tmax_trend('HOHENPEISSENBERG')
    plot_tmax_trend('ZUGSPITZE')

    # i) Correlation
    plot_corr_temp('KEMPTEN')
    plot_corr_temp('HOHENPEISSENBERG')
    plot_corr_temp('ZUGSPITZE')

if __name__ == "__main__":
    main(scon, spark)