Aufgabe 10

2026-02-04 08:45:55 +01:00 · 2025-12-11 20:39:21 +01:00
parent de3782d570
commit f89d39d420
1 changed files with 79 additions and 84 deletions
--- a/10/Aufgabe10.py
+++ b/10/Aufgabe10.py
@@ -62,7 +62,7 @@ def compute_daily_and_yearly_frosts(spark: SparkSession):
    q_daily_max = (
        "SELECT stationId, date, SUBSTR(CAST(date AS STRING),1,4) AS year, MAX(TT_TU) AS max_temp "
        "FROM german_stations_data "
-		"WHERE TT_TU IS NOT NULL "
+        "WHERE TT_TU IS NOT NULL AND TT_TU > -50 AND TT_TU < 60 "
        "GROUP BY stationId, date"
    )
    daily_max = spark.sql(q_daily_max)
@@ -88,9 +88,6 @@ def compute_daily_and_yearly_frosts(spark: SparkSession):
 def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempten',)):
    compute_daily_and_yearly_frosts(spark)

-	# Debug: check available years and data
-	spark.sql("SELECT year, COUNT(*) as cnt FROM station_year_frost GROUP BY year ORDER BY year").show(50)
-
    q_hist = (
        f"SELECT frost_days, COUNT(*) AS station_count "
        f"FROM station_year_frost WHERE year = '{year}' GROUP BY frost_days ORDER BY frost_days"
@@ -101,7 +98,6 @@ def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempte
    
    if hist_pdf.empty:
        print(f"No frost data found for year {year}. Trying to find available years...")
-		# Try without year filter to see if data exists
        q_all = "SELECT frost_days, COUNT(*) AS station_count FROM station_year_frost GROUP BY frost_days ORDER BY frost_days"
        hist_pdf = spark.sql(q_all).toPandas()
        if hist_pdf.empty:
@@ -129,11 +125,10 @@ def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempte
            sname = r['station_name']
            print(f"Analyzing stationId={sid} name={sname}")

-			# compute frostdays + 5-yr and 20-yr rolling averages using window frame
            q_ts = (
                "SELECT year, frost_days, "
-				"AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS avg_5, "
-				"AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) ROWS BETWEEN 19 PRECEDING AND CURRENT ROW) AS avg_20 "
+                "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) RANGE BETWEEN 4 PRECEDING AND CURRENT ROW) AS avg_5, "
+                "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) RANGE BETWEEN 19 PRECEDING AND CURRENT ROW) AS avg_20 "
                f"FROM station_year_frost WHERE stationId = {sid} ORDER BY CAST(year AS INT)"
            )
            ts_df = spark.sql(q_ts)