BigData/Aufgabe 11/Aufgabe11.py
from __future__ import annotations
from sparkstart import scon, spark
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
HDFSPATH = "hdfs://193.174.205.250:54310/"
def read_parquet_tables(spark: SparkSession) -> None:
    # Use your specific paths from Aufgabe 9
    stations_path = HDFSPATH + "home/heiserervalentin/german_stations.parquet"
    products_path = HDFSPATH + "home/heiserervalentin/german_stations_data.parquet"
    spark.read.parquet(stations_path).createOrReplaceTempView("german_stations")
    spark.read.parquet(products_path).createOrReplaceTempView("german_stations_data")
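
# Assumed view schemas, inferred from the queries below rather than verified
# against the Parquet files: german_stations_data has stationId, TT_TU
# (air temperature, -999 = missing value), hour, and date ('yyyyMMdd' string);
# german_stations has stationId, station_name, and bundesland.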
# --- Aufgabe A: Rollup and Plotting ---
def create_mma_rollup(spark: SparkSession, station_id: int):
    """
    Functionally identical to the model solution's 'createDataFrame'.
    Uses this schema: TT_TU (temperature), hour (messtunde), date (string yyyyMMdd).
    """
    query = f"""
        WITH processed_data AS (
            SELECT
                TT_TU,
                hour AS messtunde,
                TO_DATE(SUBSTR(date, 1, 4), 'yyyy') AS jahr,
                TO_DATE(CONCAT(SUBSTR(date, 1, 4), '-',
                               LPAD(CAST(QUARTER(TO_DATE(date, 'yyyyMMdd')) * 3 - 2 AS STRING), 2, '0'),
                               '-01')) AS quartal,
                TO_DATE(CONCAT(SUBSTR(date, 1, 4), '-', SUBSTR(date, 5, 2), '-01')) AS monat,
                TO_DATE(date, 'yyyyMMdd') AS tag
            FROM german_stations_data
            WHERE stationId = {station_id}
              AND TT_TU IS NOT NULL
              AND TT_TU <> -999
        )
        SELECT
            MIN(TT_TU) AS minTemperatur,
            MAX(TT_TU) AS maxTemperatur,
            AVG(TT_TU) AS avgTemperatur,
            jahr,
            quartal,
            monat,
            tag,
            messtunde
        FROM processed_data
        GROUP BY ROLLUP (jahr, quartal, monat, tag, messtunde)
        ORDER BY jahr, quartal, monat, tag, messtunde
    """
    df = spark.sql(query)
    df.createOrReplaceTempView("mmacdcdata")
    df.cache()
    df.show(10)
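
# ROLLUP(jahr, quartal, monat, tag, messtunde) emits one aggregate row set per
# prefix of the column list: (jahr, quartal, monat, tag, messtunde), then
# (jahr, quartal, monat, tag), and so on down to (jahr) and the grand total.
# Columns dropped at a level come back as NULL; the filters in
# plot_date_values below test exactly those NULL patterns to pick one level.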
def plot_date_values(spark: SparkSession, level: str):
    """Functionally identical plotting logic to the model solution's 'plotDateValues'."""
    filters = {
        "days": "YEAR(jahr) > 2017 AND YEAR(jahr) < 2021 AND messtunde IS NULL AND tag IS NOT NULL",
        "months": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NOT NULL",
        "quartals": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NULL AND quartal IS NOT NULL",
        "years": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NULL AND quartal IS NULL AND jahr IS NOT NULL",
    }
    x_col = {"days": "tag", "months": "monat", "quartals": "quartal", "years": "jahr"}
    pdf = spark.sql(f"SELECT * FROM mmacdcdata WHERE {filters[level]}").toPandas()
    if pdf.empty:
        return
    plt.figure(figsize=(10, 5))
    plt.plot(pdf[x_col[level]], pdf["maxTemperatur"], "red", label="Max")
    plt.plot(pdf[x_col[level]], pdf["avgTemperatur"], "green", label="Avg")
    plt.plot(pdf[x_col[level]], pdf["minTemperatur"], "blue", label="Min")
    plt.title(f"{level.capitalize()} values")
    plt.legend()
    plt.grid(True)
    plt.show()
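
# plt.show() needs an interactive backend; when running headless on the
# cluster, writing the figure to a file instead, e.g. plt.savefig(f"{level}.png")
# followed by plt.close(), is a possible alternative.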
# --- Aufgabe B: Tempmonat and Ranking ---
def create_tempmonat(spark: SparkSession):
    """Joins stations and data to create monthly aggregates using a CTE."""
    query = """
        WITH base_data AS (
            SELECT
                d.stationId,
                gs.station_name AS stationsname,
                d.TT_TU,
                TO_DATE(SUBSTR(d.date, 1, 4), 'yyyy') AS jahr_val,
                TO_DATE(CONCAT(SUBSTR(d.date, 1, 4), '-', SUBSTR(d.date, 5, 2), '-01')) AS monat_val
            FROM german_stations_data d
            JOIN german_stations gs ON d.stationId = gs.stationId
            WHERE d.TT_TU IS NOT NULL AND d.TT_TU <> -999
        )
        SELECT
            stationId,
            stationsname,
            MIN(TT_TU) AS minTemperatur,
            MAX(TT_TU) AS maxTemperatur,
            AVG(TT_TU) AS avgTemperatur,
            jahr_val AS jahr,
            monat_val AS monat
        FROM base_data
        GROUP BY stationId, stationsname, jahr_val, monat_val
    """
    spark.sql(query).cache().createOrReplaceTempView("tempmonat")
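
# cache() is lazy: the join and aggregation run once on the first action
# against tempmonat, and the ranking queries below then reuse the cached data.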
def rank_temperatures(spark: SparkSession, limit: int, year: int | None = None):
    """Covers the model solution's 'rankMinMaxAvgTemp2015' and 'rankMinMaxAvgTempYears'."""
    where_clause = f"WHERE YEAR(jahr) = {year}" if year else ""
    query = f"""
        SELECT stationId, stationsname, monat, minTemperatur,
               RANK() OVER (ORDER BY minTemperatur ASC) AS rangMIN,
               maxTemperatur,
               RANK() OVER (ORDER BY maxTemperatur DESC) AS rangMAX,
               avgTemperatur,
               RANK() OVER (ORDER BY avgTemperatur DESC) AS rangAVG
        FROM tempmonat
        {where_clause}
        ORDER BY rangMIN
    """
    spark.sql(query).show(limit, truncate=False)
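
# RANK() leaves gaps after ties: two rows sharing the lowest minTemperatur
# both get rangMIN = 1 and the next row gets 3. DENSE_RANK() would give a
# gap-free numbering instead.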
# --- Aufgabe C: Grouping Sets ---
def create_grouping_sets_view(spark: SparkSession):
    """Computes grouping sets over a CTE to avoid 'missing aggregation' errors."""
    query = """
        WITH base_gs AS (
            SELECT
                d.stationId,
                gs.bundesland,
                d.TT_TU,
                YEAR(TO_DATE(d.date, 'yyyyMMdd')) AS jahr_val,
                SUBSTR(d.date, 5, 2) AS monat_val  -- calendar month as 'MM'
            FROM german_stations_data d
            JOIN german_stations gs ON d.stationId = gs.stationId
            WHERE d.TT_TU IS NOT NULL AND d.TT_TU <> -999
        )
        SELECT
            stationId,
            bundesland,
            jahr_val AS jahr,
            monat_val AS monat,
            MIN(TT_TU) AS minTemperatur,
            MAX(TT_TU) AS maxTemperatur,
            AVG(TT_TU) AS avgTemperatur
        FROM base_gs
        GROUP BY GROUPING SETS (
            (bundesland, jahr_val),
            (stationId, jahr_val),
            (bundesland, monat_val)
        )
    """
    spark.sql(query).cache().createOrReplaceTempView("tempmma_gs")
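
# Each grouping set fills the columns it does not group by with NULL, so every
# row belongs to exactly one set. show_separate_gs below relies on this:
# requiring both columns of a set to be NOT NULL isolates that set's rows.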
def show_separate_gs(spark: SparkSession, limit: int, metric: str):
    """Covers the model solution's 'showMinMaxAvgSeperate'."""
    aggs = [
        ("bundesland", "jahr"),
        ("stationId", "jahr"),
        ("bundesland", "monat"),
    ]
    for col1, col2 in aggs:
        print(f"Aggregation: {col1} & {col2}")
        q = (
            f"SELECT {col1}, {col2}, {metric} FROM tempmma_gs "
            f"WHERE {col1} IS NOT NULL AND {col2} IS NOT NULL ORDER BY {metric}"
        )
        spark.sql(q).show(limit, truncate=False)
# --- Execution ---
def main(scon, spark):
    read_parquet_tables(spark)
    # Kempten station ID = 2559
    create_mma_rollup(spark, 2559)
    for level in ["years", "quartals", "months", "days"]:
        plot_date_values(spark, level)
    create_tempmonat(spark)
    print("Rankings 2015:")
    rank_temperatures(spark, 18, 2015)
    print("Overall rankings:")
    rank_temperatures(spark, 18)
    create_grouping_sets_view(spark)
    show_separate_gs(spark, 5, "minTemperatur")

if __name__ == "__main__":
    main(scon, spark)