bla
@@ -1,219 +1,186 @@
from sparkstart import scon, spark
import ghcnd_stations
import matplotlib.pyplot as plt
import time
from pyspark.sql import SparkSession

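# Note: sparkstart and ghcnd_stations are not PyPI packages; they appear to be
# course-local helper modules, with sparkstart assumed to provide a ready
# SparkContext (scon) and SparkSession (spark) for the cluster.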
# -- removed in this commit: GHCND-based analysis --

# a) Scatter plot: all stations (lon/lat)
def plot_all_stations(spark):
    q = """
    SELECT stationname, latitude, longitude
    FROM ghcndstations
    WHERE latitude IS NOT NULL AND longitude IS NOT NULL
    """
    t0 = time.time()
    rows = spark.sql(q).collect()
    t1 = time.time()
    print(f"Execution time (SQL): {t1 - t0:.3f}s -- Rows: {len(rows)}")

    lats = [r['latitude'] for r in rows]
    lons = [r['longitude'] for r in rows]
    names = [r['stationname'] for r in rows]  # currently unused

    plt.figure(figsize=(8, 6))
    plt.scatter(lons, lats, s=10, alpha=0.6)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('All GHCND stations (scatter)')
    plt.grid(True)
    plt.show()

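# Note: collect() pulls the full result set onto the driver as a list of Row
# objects; the added implementation further below uses toPandas() instead,
# which moves the same data but yields a DataFrame that plugs directly into
# matplotlib.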
HDFSPATH = "hdfs://193.174.205.250:54310/"
|
||||
|
||||
|
||||
# -- removed in this commit --

# b) Scatter plot: station duration in years as marker size
#    (from ghcndinventory: firstyear/lastyear)
def plot_station_duration(spark, size_factor=20):
    q = """
    SELECT
        s.stationname,
        s.latitude,
        s.longitude,
        (COALESCE(i.lastyear, year(current_date())) - COALESCE(i.firstyear, year(current_date()))) AS years
    FROM ghcndstations s
    LEFT JOIN ghcndinventory i ON s.stationid = i.stationid
    WHERE s.latitude IS NOT NULL AND s.longitude IS NOT NULL
    """
    t0 = time.time()
    rows = spark.sql(q).collect()
    t1 = time.time()
    print(f"Execution time (SQL): {t1 - t0:.3f}s -- Rows: {len(rows)}")

    lats = [r['latitude'] for r in rows]
    lons = [r['longitude'] for r in rows]
    years = [r['years'] if r['years'] is not None else 0 for r in rows]
    sizes = [max(5, (y + 1) * size_factor) for y in years]

    plt.figure(figsize=(8, 6))
    plt.scatter(lons, lats, s=sizes, alpha=0.6)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('GHCND stations: duration of availability (size ~ years)')
    plt.grid(True)
    plt.show()


# -- added in this commit --

def read_parquets(spark: SparkSession):
    stations_path = HDFSPATH + "home/heiserervalentin/german_stations.parquet"
    products_path = HDFSPATH + "home/heiserervalentin/german_stations_data.parquet"

    stations_df = spark.read.parquet(stations_path)
    stations_df.createOrReplaceTempView("german_stations")

    products_df = spark.read.parquet(products_path)
    products_df.createOrReplaceTempView("german_stations_data")

    stations_df.cache()
    products_df.cache()

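# Note: cache() is lazy -- the DataFrames are only materialized on the first
# action that touches them, after which every query against the temp views
# reuses the in-memory copy instead of re-reading the parquet files.
# A minimal optional sketch to force materialization eagerly inside
# read_parquets():
#   stations_df.count()
#   products_df.count()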
# c1) Distribution of frost days per station for one year
def plot_frost_distribution_year(spark, year):
    q = f"""
    WITH daily_max AS (
        SELECT stationid, date, MAX(CAST(value AS DOUBLE))/10.0 AS max_temp
        FROM ghcnddata
        WHERE element = 'TMAX'
          AND length(date) >= 4
          AND substr(date,1,4) = '{year}'
        GROUP BY stationid, date
    ),
    station_frost AS (
        SELECT dm.stationid, SUM(CASE WHEN dm.max_temp < 0 THEN 1 ELSE 0 END) AS frostdays
        FROM daily_max dm
        GROUP BY dm.stationid
    )
    SELECT sf.frostdays, COUNT(*) AS stations
    FROM station_frost sf
    GROUP BY sf.frostdays
    ORDER BY sf.frostdays
    """
    t0 = time.time()
    rows = spark.sql(q).collect()
    t1 = time.time()
    print(f"Execution time (SQL): {t1 - t0:.3f}s -- Distinct frostdays: {len(rows)}")

    if not rows:
        print(f"No data for year {year}.")
        return

    x = [r['frostdays'] for r in rows]
    y = [r['stations'] for r in rows]

    plt.figure(figsize=(8, 5))
    plt.bar(x, y)
    plt.xlabel('Number of frost days in year ' + str(year))
    plt.ylabel('Number of stations')
    plt.title(f'Distribution of frost days per station in year {year}')
    plt.grid(True)
    plt.show()


# -- added in this commit --

def plot_all_stations(spark: SparkSession):
    q = "SELECT geo_laenge AS lon, geo_breite AS lat FROM german_stations WHERE geo_laenge IS NOT NULL AND geo_breite IS NOT NULL"
    df = spark.sql(q)

    pdf = df.toPandas()
    plt.figure(figsize=(8, 6))
    plt.scatter(pdf.lon, pdf.lat, s=6, color='red', marker='.')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('All Stations (locations)')
    plt.tight_layout()
    plt.show()

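# Note on units and terminology: GHCND stores temperatures in tenths of a
# degree Celsius, hence the CAST(...)/10.0 above. Counting days whose daily
# *maximum* stays below 0 degC matches the meteorological definition of an ice
# day; a frost day is usually defined via the daily *minimum*. The "frostdays"
# counted in this file are therefore ice days in the strict sense.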
# -- removed in this commit --

# c2) Frost-day time series for one station with 5- and 20-year averages (SQL window)
def plot_station_frost_timeseries(spark, station_name):
    q = f"""
    WITH daily_max AS (
        SELECT stationid, date, MAX(CAST(value AS DOUBLE))/10.0 AS max_temp
        FROM ghcnddata
        WHERE element = 'TMAX'
        GROUP BY stationid, date
    ),
    yearly AS (
        SELECT
            dm.stationid,
            CAST(substr(dm.date,1,4) AS INT) AS year,
            SUM(CASE WHEN dm.max_temp < 0 THEN 1 ELSE 0 END) AS frostdays
        FROM daily_max dm
        GROUP BY dm.stationid, CAST(substr(dm.date,1,4) AS INT)
    ),
    station_yearly AS (
        SELECT
            y.year,
            y.frostdays,
            AVG(y.frostdays) OVER (ORDER BY y.year ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS avg5,
            AVG(y.frostdays) OVER (ORDER BY y.year ROWS BETWEEN 19 PRECEDING AND CURRENT ROW) AS avg20
        FROM yearly y
        JOIN ghcndstations s ON y.stationid = s.stationid
        WHERE trim(upper(s.stationname)) = '{station_name.upper()}'
    )
    SELECT * FROM station_yearly ORDER BY year
    """
    t0 = time.time()
    rows = spark.sql(q).collect()
    t1 = time.time()
    print(f"Execution time (SQL): {t1 - t0:.3f}s -- Years: {len(rows)}")

    if not rows:
        print(f"No data for station '{station_name}'.")
        return

    years = [r['year'] for r in rows]
    frostdays = [r['frostdays'] for r in rows]
    avg5 = [r['avg5'] for r in rows]
    avg20 = [r['avg20'] for r in rows]

    plt.figure(figsize=(10, 5))
    plt.plot(years, frostdays, label='Frost days (year)')
    plt.plot(years, avg5, label='5-year average')
    plt.plot(years, avg20, label='20-year average')
    plt.xlabel('Year')
    plt.ylabel('Number of frost days')
    plt.title(f'Frost days for station {station_name}')
    plt.legend()
    plt.grid(True)
    plt.show()


# -- added in this commit --

def duration_circle_size(spark: SparkSession):
    q = (
        "SELECT stationId, geo_laenge AS lon, geo_breite AS lat, "
        "(CAST(SUBSTR(bis_datum,1,4) AS INT) - CAST(SUBSTR(von_datum,1,4) AS INT)) AS duration_years "
        "FROM german_stations "
        "WHERE TRIM(von_datum) <> '' AND TRIM(bis_datum) <> ''"
    )
    df = spark.sql(q)

    pdf = df.toPandas()

    pdf['duration_years'] = pdf['duration_years'].fillna(0).astype(int)
    sizes = (pdf['duration_years'].clip(lower=0) + 1) * 6

    plt.figure(figsize=(8, 6))
    plt.scatter(pdf.lon, pdf.lat, s=sizes, alpha=0.6, c=pdf['duration_years'], cmap='viridis')
    plt.colorbar(label='Duration (years)')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title('Stations with duration (years) as marker size')
    plt.tight_layout()
    plt.show()

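# Note: ROWS BETWEEN 4 (resp. 19) PRECEDING AND CURRENT ROW is a trailing
# 5-year (resp. 20-year) mean. A minimal pandas sketch of the same computation,
# assuming a hypothetical Series `s` of frostdays indexed by year:
#   import pandas as pd
#   avg5 = s.rolling(window=5, min_periods=1).mean()
#   avg20 = s.rolling(window=20, min_periods=1).mean()
# min_periods=1 mirrors the SQL frame, which averages a shorter prefix until a
# full window is available.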
# -- removed in this commit --

# d) Correlation of height (elevation) vs. frost days per year
def plot_height_frost_correlation(spark):
    q = """
    WITH daily_max AS (
        SELECT stationid, date, MAX(CAST(value AS DOUBLE))/10.0 AS max_temp
        FROM ghcnddata
        WHERE element = 'TMAX'
        GROUP BY stationid, date
    ),
    yearly AS (
        SELECT
            dm.stationid,
            CAST(substr(dm.date,1,4) AS INT) AS year,
            SUM(CASE WHEN dm.max_temp < 0 THEN 1 ELSE 0 END) AS frostdays
        FROM daily_max dm
        GROUP BY dm.stationid, CAST(substr(dm.date,1,4) AS INT)
    ),
    joined AS (
        SELECT y.year, s.elevation, y.frostdays
        FROM yearly y
        JOIN ghcndstations s ON y.stationid = s.stationid
        WHERE s.elevation IS NOT NULL
    ),
    yearly_corr AS (
        SELECT year, corr(elevation, frostdays) AS corr
        FROM joined
        GROUP BY year
    )
    SELECT year, corr FROM yearly_corr WHERE corr IS NOT NULL ORDER BY year
    """
    t0 = time.time()
    rows = spark.sql(q).collect()
    t1 = time.time()
    print(f"Execution time (SQL): {t1 - t0:.3f}s -- Years with corr: {len(rows)}")

    if not rows:
        print("No correlation data available.")
        return

    years = [r['year'] for r in rows]
    corr = [r['corr'] for r in rows]

    plt.figure(figsize=(10, 5))
    plt.bar(years, corr)
    plt.xlabel('Year')
    plt.ylabel('Correlation coefficient (elevation vs frostdays)')
    plt.title('Correlation of height (elevation) vs. frost days per year')
    plt.grid(True)
    plt.show()


# -- added in this commit --

def compute_daily_and_yearly_frosts(spark: SparkSession):
    q_daily_max = (
        "SELECT stationId, date, SUBSTR(date,1,4) AS year, MAX(TT_TU) AS max_temp "
        "FROM german_stations_data "
        "GROUP BY stationId, date"
    )
    daily_max = spark.sql(q_daily_max)
    daily_max.createOrReplaceTempView('daily_max')

    # mark a day as frost if max_temp < 0
    q_daily_frost = (
        "SELECT stationId, year, CASE WHEN max_temp < 0 THEN 1 ELSE 0 END AS is_frost "
        "FROM daily_max"
    )
    daily_frost = spark.sql(q_daily_frost)
    daily_frost.createOrReplaceTempView('daily_frost')

    # yearly frostdays per station
    q_station_year = (
        "SELECT stationId, year, SUM(is_frost) AS frost_days "
        "FROM daily_frost GROUP BY stationId, year"
    )
    station_year_frost = spark.sql(q_station_year)
    station_year_frost.createOrReplaceTempView('station_year_frost')

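# Assumption: TT_TU is the hourly 2 m air temperature column of the DWD hourly
# station files, already in degrees Celsius (no /10 scaling as in GHCND), so
# MAX(TT_TU) per (stationId, date) yields the daily maximum temperature.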
def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempten',)):
    compute_daily_and_yearly_frosts(spark)

    q_hist = (
        f"SELECT frost_days, COUNT(*) AS station_count "
        f"FROM station_year_frost WHERE year = '{year}' GROUP BY frost_days ORDER BY frost_days"
    )
    hist_df = spark.sql(q_hist)

    hist_pdf = hist_df.toPandas()
    plt.figure(figsize=(8, 5))
    plt.bar(hist_pdf.frost_days, hist_pdf.station_count, color='steelblue')
    plt.xlabel('Number of Frost Days in year ' + str(year))
    plt.ylabel('Number of Stations')
    plt.title(f'Stations vs Frost Days ({year})')
    plt.tight_layout()
    plt.show()

    for name in station_name_matches:
        q_find = f"SELECT stationId, station_name FROM german_stations WHERE lower(station_name) LIKE '%{name.lower()}%'"
        ids_df = spark.sql(q_find)
        ids = ids_df.collect()
        if not ids:
            print(f"No stations found matching '{name}'")
            continue
        for r in ids:
            sid = r['stationId']
            sname = r['station_name']
            print(f"Analyzing stationId={sid} name={sname}")

            # compute frostdays + 5-yr and 20-yr rolling averages using window frame;
            # sid is quoted so the comparison also works when stationId is a string column
            q_ts = (
                "SELECT year, frost_days, "
                "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS avg_5, "
                "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) ROWS BETWEEN 19 PRECEDING AND CURRENT ROW) AS avg_20 "
                f"FROM station_year_frost WHERE stationId = '{sid}' ORDER BY CAST(year AS INT)"
            )
            ts_df = spark.sql(q_ts)

            pdf = ts_df.toPandas()
            if pdf.empty:
                print(f"No yearly frost data for station {sid}")
                continue

            pdf['year'] = pdf['year'].astype(int)
            plt.figure(figsize=(10, 5))
            plt.plot(pdf.year, pdf.frost_days, label='Frostdays (year)', marker='o')
            plt.plot(pdf.year, pdf.avg_5, label='5-year avg', linestyle='--')
            plt.plot(pdf.year, pdf.avg_20, label='20-year avg', linestyle=':')
            plt.xlabel('Year')
            plt.ylabel('Frost Days')
            plt.title(f'Frost Days over Years for {sname} (station {sid})')
            plt.legend()
            plt.tight_layout()
            plt.show()

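# Note: with ROWS BETWEEN 4 (resp. 19) PRECEDING AND CURRENT ROW, the first 4
# (resp. 19) years of avg_5/avg_20 average over fewer points, so the early
# part of both curves leans toward the raw yearly values.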
def height_frost_correlation(spark: SparkSession):
    compute_daily_and_yearly_frosts(spark)

    q_corr = (
        "SELECT syf.year AS year, corr(s.hoehe, syf.frost_days) AS height_frost_corr "
        "FROM station_year_frost syf JOIN german_stations s ON syf.stationId = s.stationId "
        "GROUP BY syf.year ORDER BY CAST(syf.year AS INT)"
    )

    corr_df = spark.sql(q_corr)

    corr_pdf = corr_df.toPandas()

    corr_pdf = corr_pdf.dropna(subset=['height_frost_corr'])
    if corr_pdf.empty:
        print("No non-NaN correlation values found.")
        return

    corr_pdf['year'] = corr_pdf['year'].astype(int)
    plt.figure(figsize=(10, 5))
    plt.bar(corr_pdf.year, corr_pdf.height_frost_corr, color='orange')
    plt.xlabel('Year')
    plt.ylabel('Correlation (height vs frostdays)')
    plt.title('Yearly correlation: station height vs number of frost days')
    plt.tight_layout()
    plt.show()

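# Note: Spark SQL's corr() computes the Pearson correlation coefficient. A
# minimal NumPy cross-check for a single year, assuming hypothetical arrays
# `heights` and `frost_days` of equal length:
#   import numpy as np
#   r = np.corrcoef(heights, frost_days)[0, 1]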
def main(scon, spark):
    read_parquets(spark)

    plot_all_stations(spark)

    duration_circle_size(spark)

    frost_analysis(spark, year=2024, station_name_matches=('kempten',))

    height_frost_correlation(spark)

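# Note: main() relies on read_parquets() having registered the german_stations
# and german_stations_data temp views, which is why it is called first; the
# scon parameter is currently unused.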
if __name__ == '__main__':
    # removed in this commit:
    #   ghcnd_stations.read_ghcnd_from_parquet(spark)
    #   plot_all_stations(spark)
    #   plot_station_duration(spark)
    #   plot_frost_distribution_year(spark, '2010')
    #   plot_station_frost_timeseries(spark, 'KEMPTEN')
    #   plot_height_frost_correlation(spark)
    #   pass
    # added in this commit:
    main(scon, spark)