from __future__ import annotations

from sparkstart import scon, spark
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt

HDFSPATH = "hdfs://193.174.205.250:54310/"


def read_parquet_tables(spark: SparkSession) -> None:
    """Load station master data and hourly measurements from parquet."""
    stations_path = HDFSPATH + "home/heiserervalentin/german_stations.parquet"
    products_path = HDFSPATH + "home/heiserervalentin/german_stations_data.parquet"

    stations_df = spark.read.parquet(stations_path)
    stations_df.createOrReplaceTempView("german_stations")
    stations_df.cache()

    products_df = spark.read.parquet(products_path)
    products_df.createOrReplaceTempView("german_stations_data")
    products_df.cache()


def _escape_like(value: str) -> str:
    """Escape single quotes for safe SQL literal usage.

    LIKE wildcards ('%', '_') are left untouched, so callers may embed
    their own patterns in the search string.
    """
    return value.replace("'", "''")


def resolve_station_id(spark: SparkSession, station_identifier: int | str) -> int:
    """Resolve a station id from an int, a numeric string, or a fuzzy name search."""
    if isinstance(station_identifier, int):
        return station_identifier
    if isinstance(station_identifier, str) and station_identifier.strip().isdigit():
        return int(station_identifier.strip())
    if isinstance(station_identifier, str):
        needle = _escape_like(station_identifier.lower())
        q = (
            "SELECT stationId FROM german_stations "
            f"WHERE lower(station_name) LIKE '%{needle}%' "
            "ORDER BY station_name LIMIT 1"
        )
        result = spark.sql(q).collect()
        if not result:
            raise ValueError(f"No station found for pattern '{station_identifier}'")
        return int(result[0]["stationId"])
    raise ValueError("station_identifier must be int or str")


def build_station_rollup_for_station(spark: SparkSession, station_identifier: int | str) -> None:
    """Create rollup view with min/max/avg per hour/day/month/quarter/year."""
    station_id = resolve_station_id(spark, station_identifier)
    q = f"""
    WITH base AS (
        SELECT
            d.stationId,
            gs.station_name,
            TO_TIMESTAMP(CONCAT(d.date, LPAD(CAST(d.hour AS STRING), 2, '0')), 'yyyyMMddHH') AS hour_ts,
            TO_DATE(d.date, 'yyyyMMdd') AS day_date,
            MONTH(TO_DATE(d.date, 'yyyyMMdd')) AS month_in_year,
            QUARTER(TO_DATE(d.date, 'yyyyMMdd')) AS quarter_in_year,
            YEAR(TO_DATE(d.date, 'yyyyMMdd')) AS year_value,
            d.TT_TU AS temperature
        FROM german_stations_data d
        JOIN german_stations gs ON d.stationId = gs.stationId
        WHERE d.stationId = {station_id}
          AND d.TT_TU IS NOT NULL
          AND d.TT_TU <> -999  -- -999 is the DWD sentinel for missing measurements
    ),
    rollup_base AS (
        SELECT
            stationId, station_name, hour_ts, day_date,
            month_in_year, quarter_in_year, year_value,
            MIN(temperature) AS min_temp,
            MAX(temperature) AS max_temp,
            AVG(temperature) AS avg_temp
        FROM base
        GROUP BY stationId, station_name,
                 ROLLUP(year_value, quarter_in_year, month_in_year, day_date, hour_ts)
    )
    SELECT
        stationId, station_name, hour_ts, day_date,
        month_in_year, quarter_in_year, year_value,
        CASE WHEN month_in_year IS NOT NULL THEN
            TO_DATE(CONCAT(CAST(year_value AS STRING), '-',
                           LPAD(CAST(month_in_year AS STRING), 2, '0'), '-01'))
        END AS month_start_date,
        CASE WHEN quarter_in_year IS NOT NULL THEN
            TO_DATE(CONCAT(CAST(year_value AS STRING), '-',
                           LPAD(CAST(quarter_in_year * 3 - 2 AS STRING), 2, '0'), '-01'))
        END AS quarter_start_date,
        CASE WHEN year_value IS NOT NULL THEN
            TO_DATE(CONCAT(CAST(year_value AS STRING), '-01-01'))
        END AS year_start_date,
        min_temp, max_temp, avg_temp
    FROM rollup_base
    """
    rollup_df = spark.sql(q)
    rollup_df.cache()
    rollup_df.createOrReplaceTempView("station_rollup")
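
# Hedged sketch (not part of the original pipeline): one way to sanity-check the
# rollup is to count rows per aggregation level. With ROLLUP, each level is
# identified by which of the rolled-up columns are NULL; the helper name below is
# hypothetical and assumes build_station_rollup_for_station() has already run.
def preview_rollup_levels(spark: SparkSession) -> None:
    """Show how many rows the rollup produced at each aggregation level."""
    spark.sql(
        """
        SELECT CASE
                   WHEN hour_ts IS NOT NULL THEN 'hour'
                   WHEN day_date IS NOT NULL THEN 'day'
                   WHEN month_in_year IS NOT NULL THEN 'month'
                   WHEN quarter_in_year IS NOT NULL THEN 'quarter'
                   WHEN year_value IS NOT NULL THEN 'year'
                   ELSE 'grand total'
               END AS level,
               COUNT(*) AS n_rows
        FROM station_rollup
        GROUP BY 1
        ORDER BY n_rows DESC
        """
    ).show()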

def _year_window(spark: SparkSession, years_back: int, station_id: int) -> tuple[int, int] | None:
    """Return (start_year, end_year) covering at most years_back years of data."""
    stats = spark.sql(
        "SELECT MIN(year_value) AS min_year, MAX(year_value) AS max_year "
        "FROM station_rollup "
        f"WHERE year_value IS NOT NULL AND stationId = {station_id}"
    ).collect()
    if not stats or stats[0]["max_year"] is None:
        return None
    min_year = int(stats[0]["min_year"])
    max_year = int(stats[0]["max_year"])
    start_year = max(min_year, max_year - years_back + 1)
    return start_year, max_year


def plot_station_rollup_levels(
    spark: SparkSession,
    station_identifier: int | str,
    day_span_years: int = 3,
    agg_span_years: int = 15,
) -> None:
    """Plot day, month, quarter, and year aggregates for the given station."""
    station_id = resolve_station_id(spark, station_identifier)

    # Rebuild the rollup view if it is missing or was built for another station.
    needs_refresh = not spark.catalog.tableExists("station_rollup")
    if not needs_refresh:
        count = spark.sql(
            f"SELECT COUNT(*) AS cnt FROM station_rollup WHERE stationId = {station_id}"
        ).collect()[0]["cnt"]
        needs_refresh = count == 0
    if needs_refresh:
        build_station_rollup_for_station(spark, station_id)

    day_window = _year_window(spark, day_span_years, station_id)
    if day_window is None:
        print("No data available for plotting")
        return
    month_window = _year_window(spark, agg_span_years, station_id)
    if month_window is None:
        print("No aggregated window available")
        return

    def _plot(query: str, figure_idx: int, title: str, x_col: str = "bucket_date") -> None:
        pdf = spark.sql(query).toPandas()
        if pdf.empty:
            print(f"No data for {title}")
            return
        plt.figure(num=figure_idx)
        plt.clf()
        metrics = [
            ("min_temp", "Min", "#1f77b4"),
            ("avg_temp", "Avg", "#ff7f0e"),
            ("max_temp", "Max", "#2ca02c"),
        ]
        for col, label, color in metrics:
            if col in pdf:
                plt.plot(pdf[x_col], pdf[col], label=label, color=color)
        plt.title(title)
        plt.xlabel("Date")
        plt.ylabel("Temperature (°C)")
        plt.legend()
        plt.tight_layout()
        plt.show()

    # Each rollup level is selected by which finer-grained columns are NULL.
    day_start, day_end = day_window
    q_day = f"""
        SELECT day_date AS bucket_date, min_temp, avg_temp, max_temp
        FROM station_rollup
        WHERE stationId = {station_id}
          AND hour_ts IS NULL AND day_date IS NOT NULL
          AND year_value BETWEEN {day_start} AND {day_end}
        ORDER BY bucket_date
    """
    _plot(q_day, 1, f"Daily aggregates {day_start}-{day_end}")

    agg_start, agg_end = month_window
    q_month = f"""
        SELECT month_start_date AS bucket_date, min_temp, avg_temp, max_temp
        FROM station_rollup
        WHERE stationId = {station_id}
          AND day_date IS NULL AND month_in_year IS NOT NULL
          AND year_value BETWEEN {agg_start} AND {agg_end}
        ORDER BY bucket_date
    """
    _plot(q_month, 2, f"Monthly aggregates {agg_start}-{agg_end}")

    q_quarter = f"""
        SELECT quarter_start_date AS bucket_date, min_temp, avg_temp, max_temp
        FROM station_rollup
        WHERE stationId = {station_id}
          AND month_in_year IS NULL AND quarter_in_year IS NOT NULL
          AND year_value BETWEEN {agg_start} AND {agg_end}
        ORDER BY bucket_date
    """
    _plot(q_quarter, 3, f"Quarterly aggregates {agg_start}-{agg_end}")

    q_year = f"""
        SELECT year_start_date AS bucket_date, min_temp, avg_temp, max_temp
        FROM station_rollup
        WHERE stationId = {station_id}
          AND quarter_in_year IS NULL AND year_value IS NOT NULL
        ORDER BY bucket_date
    """
    _plot(q_year, 4, "Yearly aggregates")
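
# Hedged aside: plt.show() assumes an interactive display. When this script runs on
# a headless cluster node, a minimal variant (sketch; the file name is hypothetical)
# would switch matplotlib to the non-interactive Agg backend and save each figure:
#
#   import matplotlib
#   matplotlib.use("Agg")  # ideally set before pyplot is first used
#   ...
#   plt.savefig(f"station_{station_id}_daily.png")  # replaces plt.show()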

def create_tempmonat(spark: SparkSession) -> None:
    """Create cached temp view tempmonat with monthly aggregates per station."""
    q = """
    SELECT
        d.stationId,
        gs.station_name,
        YEAR(TO_DATE(d.date, 'yyyyMMdd')) AS year_value,
        MONTH(TO_DATE(d.date, 'yyyyMMdd')) AS month_value,
        MIN(d.TT_TU) AS min_temp,
        MAX(d.TT_TU) AS max_temp,
        AVG(d.TT_TU) AS avg_temp
    FROM german_stations_data d
    JOIN german_stations gs ON d.stationId = gs.stationId
    WHERE d.TT_TU IS NOT NULL AND d.TT_TU <> -999
    GROUP BY d.stationId, gs.station_name,
             YEAR(TO_DATE(d.date, 'yyyyMMdd')),
             MONTH(TO_DATE(d.date, 'yyyyMMdd'))
    """
    monthly_df = spark.sql(q)
    monthly_df.cache()
    monthly_df.createOrReplaceTempView("tempmonat")


def rank_coldest_per_month_2015(spark: SparkSession):
    """Rank stations by coldest values per month for 2015 using tempmonat."""
    return spark.sql(
        """
        SELECT stationId, station_name, year_value, month_value,
               min_temp, max_temp, avg_temp,
               RANK() OVER (PARTITION BY month_value ORDER BY min_temp ASC) AS rank_min,
               RANK() OVER (PARTITION BY month_value ORDER BY max_temp ASC) AS rank_max,
               RANK() OVER (PARTITION BY month_value ORDER BY avg_temp ASC) AS rank_avg
        FROM tempmonat
        WHERE year_value = 2015
        ORDER BY rank_min, month_value
        """
    )


def rank_coldest_overall(spark: SparkSession):
    """Rank stations by coldest values over all months/years (no partition)."""
    return spark.sql(
        """
        SELECT stationId, station_name, year_value, month_value,
               min_temp, max_temp, avg_temp,
               RANK() OVER (ORDER BY min_temp ASC) AS rank_min,
               RANK() OVER (ORDER BY max_temp ASC) AS rank_max,
               RANK() OVER (ORDER BY avg_temp ASC) AS rank_avg
        FROM tempmonat
        ORDER BY rank_min
        """
    )


def create_grouping_sets_overview(spark: SparkSession) -> None:
    """Compute grouping sets for the requested aggregations and cache the result."""
    q = """
    WITH base AS (
        SELECT
            YEAR(TO_DATE(d.date, 'yyyyMMdd')) AS year_value,
            MONTH(TO_DATE(d.date, 'yyyyMMdd')) AS month_value,
            gs.bundesland,
            gs.stationId,
            gs.station_name,
            d.TT_TU AS temperature
        FROM german_stations_data d
        JOIN german_stations gs ON d.stationId = gs.stationId
        WHERE d.TT_TU IS NOT NULL AND d.TT_TU <> -999
    )
    SELECT year_value, month_value, bundesland, stationId, station_name,
           MIN(temperature) AS min_temp,
           MAX(temperature) AS max_temp,
           AVG(temperature) AS avg_temp
    FROM base
    GROUP BY GROUPING SETS (
        (year_value, bundesland),
        (year_value, stationId, station_name, bundesland),
        (month_value, bundesland)
    )
    """
    grouped_df = spark.sql(q)
    grouped_df.cache()
    grouped_df.createOrReplaceTempView("grouping_sets_stats")


def select_year_bundesland(spark: SparkSession):
    """Aggregates per (year, Bundesland); the NULL filters pick that grouping set."""
    return spark.sql(
        """
        SELECT year_value, bundesland, min_temp, max_temp, avg_temp
        FROM grouping_sets_stats
        WHERE bundesland IS NOT NULL AND month_value IS NULL AND stationId IS NULL
        ORDER BY year_value, bundesland
        """
    )


def select_year_station(spark: SparkSession):
    """Aggregates per (year, station)."""
    return spark.sql(
        """
        SELECT year_value, stationId, station_name, min_temp, max_temp, avg_temp
        FROM grouping_sets_stats
        WHERE stationId IS NOT NULL AND month_value IS NULL
        ORDER BY year_value, stationId
        """
    )


def select_month_bundesland(spark: SparkSession):
    """Aggregates per (month, Bundesland), pooled over all years."""
    return spark.sql(
        """
        SELECT month_value, bundesland, min_temp, max_temp, avg_temp
        FROM grouping_sets_stats
        WHERE month_value IS NOT NULL AND year_value IS NULL
        ORDER BY month_value, bundesland
        """
    )


def main(scon, spark):
    read_parquet_tables(spark)

    build_station_rollup_for_station(spark, "kempten")
    plot_station_rollup_levels(spark, "kempten")

    create_tempmonat(spark)
    print("Rankings per month, 2015:")
    rank_coldest_per_month_2015(spark).show(36, truncate=False)
    print("Overall rankings:")
    rank_coldest_overall(spark).show(36, truncate=False)

    create_grouping_sets_overview(spark)
    print("Year vs. Bundesland:")
    select_year_bundesland(spark).show(20, truncate=False)
    print("Year vs. station:")
    select_year_station(spark).show(20, truncate=False)
    print("Month vs. Bundesland:")
    select_month_bundesland(spark).show(20, truncate=False)


if __name__ == "__main__":
    main(scon, spark)
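
# Hedged usage note: `sparkstart` is assumed to be a site-local module that exposes
# a ready SparkContext (`scon`) and SparkSession (`spark`). In an interactive
# pyspark session the same flow can be reproduced step by step, e.g.:
#
#   read_parquet_tables(spark)
#   create_tempmonat(spark)
#   rank_coldest_per_month_2015(spark).show(36, truncate=False)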