# Mirror of https://github.com/Vale54321/BigData.git
# Synced 2026-02-04 00:35:55 +01:00
from __future__ import annotations
|
|
from sparkstart import scon, spark
|
|
from pyspark.sql import SparkSession
|
|
import matplotlib.pyplot as plt
|
|
|
|
HDFSPATH = "hdfs://193.174.205.250:54310/"
|
|
|
|
def read_parquet_tables(spark: SparkSession) -> None:
    """Register the two HDFS parquet files as temp SQL views.

    Creates the views ``german_stations`` (station master data) and
    ``german_stations_data`` (hourly measurement rows) that every query
    in this script relies on.

    :param spark: active SparkSession used for reading and view registration.
    """
    # Paths from Aufgabe 9.  NOTE(review): the second file holds measurement
    # data, so the local name now says so — the old name ``products_path``
    # was misleading (copy-paste from another exercise, presumably).
    stations_path = HDFSPATH + "home/heiserervalentin/german_stations.parquet"
    data_path = HDFSPATH + "home/heiserervalentin/german_stations_data.parquet"

    spark.read.parquet(stations_path).createOrReplaceTempView("german_stations")
    spark.read.parquet(data_path).createOrReplaceTempView("german_stations_data")
|
|
|
|
# --- Aufgabe A: Rollup and Plotting ---
|
|
|
|
def create_mma_rollup(spark: SparkSession, station_id: int):
    """
    Build a min/max/avg temperature ROLLUP for one station.

    Functionally identical to Musterlösung 'createDataFrame'.
    Uses your schema: TT_TU (temp), hour (messtunde), date (string yyyyMMdd).

    The result is cached, registered as temp view ``mmacdcdata`` (consumed
    by plot_date_values), and its first 10 rows are printed as a sanity check.

    :param spark: active SparkSession; view ``german_stations_data`` must exist.
    :param station_id: numeric station id filtered in the WHERE clause.
    """
    # ROLLUP over (jahr, quartal, monat, tag, messtunde) produces one
    # aggregate row per hierarchy level (year, quarter, month, day, hour);
    # coarser levels carry NULL in the finer columns.  -999 is a sentinel
    # value filtered out alongside NULLs (presumably "measurement missing"
    # in the raw data — TODO confirm against the DWD source format).
    query = f"""
    WITH processed_data AS (
        SELECT
            TT_TU,
            hour AS messtunde,
            TO_DATE(SUBSTR(date, 1, 4), 'yyyy') AS jahr,
            TO_DATE(CONCAT(SUBSTR(date, 1, 4), '-', LPAD(CAST(QUARTER(TO_DATE(date, 'yyyyMMdd'))*3-2 AS STRING), 2, '0'), '-01')) AS quartal,
            TO_DATE(CONCAT(SUBSTR(date, 1, 4), '-', SUBSTR(date, 5, 2), '-01')) AS monat,
            TO_DATE(date, 'yyyyMMdd') AS tag
        FROM german_stations_data
        WHERE stationId = {station_id}
          AND TT_TU IS NOT NULL
          AND TT_TU <> -999
    )
    SELECT
        MIN(TT_TU) AS minTemperatur,
        MAX(TT_TU) AS maxTemperatur,
        AVG(TT_TU) AS avgTemperatur,
        jahr,
        quartal,
        monat,
        tag,
        messtunde
    FROM processed_data
    GROUP BY ROLLUP (jahr, quartal, monat, tag, messtunde)
    ORDER BY jahr, quartal, monat, tag, messtunde
    """
    df = spark.sql(query)
    df.createOrReplaceTempView("mmacdcdata")
    df.cache()  # the view is queried once per plotting level afterwards
    df.show(10)
|
|
|
|
def plot_date_values(spark: SparkSession, level: str):
    """Plot min/avg/max temperature curves for one rollup level.

    Functionally identical plotting logic to Musterlösung 'plotDateValues'.

    :param spark: active SparkSession; temp view ``mmacdcdata`` must exist.
    :param level: one of "days", "months", "quartals", "years".
    """
    # Each rollup level is identified by which hierarchy columns are NULL:
    # e.g. day rows have messtunde NULL but tag set.
    level_filters = {
        "days": "YEAR(jahr) > 2017 AND YEAR(jahr) < 2021 AND messtunde IS NULL AND tag IS NOT NULL",
        "months": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NOT NULL",
        "quartals": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NULL AND quartal IS NOT NULL",
        "years": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NULL AND quartal IS NULL AND jahr IS NOT NULL",
    }
    x_axis_column = {"days": "tag", "months": "monat", "quartals": "quartal", "years": "jahr"}

    frame = spark.sql(f"SELECT * FROM mmacdcdata WHERE {level_filters[level]}").toPandas()
    if frame.empty:
        return  # nothing to draw for this level

    plt.figure(figsize=(10, 5))
    x_values = frame[x_axis_column[level]]
    # Draw the three series in the same order (and colors) as the original.
    for column, color, label in (
        ("maxTemperatur", "red", "Max"),
        ("avgTemperatur", "green", "Avg"),
        ("minTemperatur", "blue", "Min"),
    ):
        plt.plot(x_values, frame[column], color, label=label)
    plt.title(f"{level.capitalize()}werte")
    plt.legend()
    plt.grid(True)
    plt.show()
|
|
|
|
# --- Aufgabe B: Tempmonat and Ranking ---
|
|
|
|
def create_tempmonat(spark: SparkSession):
    """Joins stations and data to create monthly aggregates using a CTE.

    Produces one row per (station, year, month) with min/max/avg TT_TU,
    caches it, and registers it as temp view ``tempmonat`` (consumed by
    rank_temperatures).

    :param spark: active SparkSession; views ``german_stations`` and
        ``german_stations_data`` must already be registered.
    """
    # The CTE pre-computes the date columns once so the outer GROUP BY can
    # reference them by alias; NULL and the -999 sentinel are filtered out.
    query = """
    WITH base_data AS (
        SELECT
            d.stationId,
            gs.station_name AS stationsname,
            d.TT_TU,
            TO_DATE(SUBSTR(d.date, 1, 4), 'yyyy') AS jahr_val,
            TO_DATE(CONCAT(SUBSTR(d.date, 1, 4), '-', SUBSTR(d.date, 5, 2), '-01')) AS monat_val
        FROM german_stations_data d
        JOIN german_stations gs ON d.stationId = gs.stationId
        WHERE d.TT_TU IS NOT NULL AND d.TT_TU <> -999
    )
    SELECT
        stationId,
        stationsname,
        MIN(TT_TU) AS minTemperatur,
        MAX(TT_TU) AS maxTemperatur,
        AVG(TT_TU) AS avgTemperatur,
        jahr_val AS jahr,
        monat_val AS monat
    FROM base_data
    GROUP BY stationId, stationsname, jahr_val, monat_val
    """
    spark.sql(query).cache().createOrReplaceTempView("tempmonat")
|
|
|
|
def rank_temperatures(spark: SparkSession, limit: int, year: int | None = None):
    """Show min/max/avg temperature rankings over the monthly aggregates.

    Musterlösung 'rankMinMaxAvgTemp2015' and 'rankMinMaxAvgTempYears'.

    :param spark: active SparkSession; temp view ``tempmonat`` must exist.
    :param limit: number of rows passed to DataFrame.show().
    :param year: optional year filter; ``None`` ranks across all years.
        (Default unchanged — callers passing no year keep old behavior.)
    """
    # Explicit None check instead of truthiness: a truthiness test would
    # silently drop a (theoretical) year value of 0.  `year` is an int, so
    # the f-string interpolation cannot inject arbitrary SQL.
    where_clause = f"WHERE YEAR(jahr) = {year}" if year is not None else ""
    query = f"""
    SELECT stationid, stationsname, monat, minTemperatur,
           RANK() OVER (ORDER BY minTemperatur ASC) AS rangMIN,
           maxTemperatur,
           RANK() OVER (ORDER BY maxTemperatur DESC) AS rangMAX,
           avgTemperatur,
           RANK() OVER (ORDER BY avgTemperatur DESC) AS rangAVG
    FROM tempmonat
    {where_clause}
    ORDER BY rangMIN
    """
    spark.sql(query).show(limit, truncate=False)
|
|
|
|
# --- Aufgabe C: Grouping Sets ---
|
|
|
|
def create_grouping_sets_view(spark: SparkSession):
    """Computes grouping sets using a CTE to avoid Missing Aggregation errors.

    Aggregates min/max/avg TT_TU over three grouping sets —
    (bundesland, jahr), (stationId, jahr), (bundesland, monat) — caches
    the result, and registers it as temp view ``tempmma_gs`` (consumed by
    show_seperate_gs).

    :param spark: active SparkSession; views ``german_stations`` and
        ``german_stations_data`` must already be registered.
    """
    # NOTE(review): monat_val concatenates SUBSTR(date, 7, 2) (day) with
    # SUBSTR(date, 5, 2) (month) for a yyyyMMdd string, i.e. "DD-MM" —
    # that groups per calendar day-of-year, not per month as the alias
    # suggests.  Left as-is; confirm against the Musterlösung.
    query = """
    WITH base_gs AS (
        SELECT
            d.stationId,
            gs.bundesland,
            d.TT_TU,
            YEAR(TO_DATE(d.date, 'yyyyMMdd')) AS jahr_val,
            CONCAT(SUBSTR(d.date, 7, 2), '-', SUBSTR(d.date, 5, 2)) AS monat_val
        FROM german_stations_data d
        JOIN german_stations gs ON d.stationId = gs.stationId
        WHERE d.TT_TU IS NOT NULL AND d.TT_TU <> -999
    )
    SELECT
        stationId,
        bundesland,
        jahr_val AS jahr,
        monat_val AS monat,
        MIN(TT_TU) AS minTemperatur,
        MAX(TT_TU) AS maxTemperatur,
        AVG(TT_TU) AS avgTemperatur
    FROM base_gs
    GROUP BY GROUPING SETS (
        (bundesland, jahr_val),
        (stationId, jahr_val),
        (bundesland, monat_val)
    )
    """
    spark.sql(query).cache().createOrReplaceTempView("tempmma_gs")
|
|
|
|
def show_seperate_gs(spark: SparkSession, limit: int, metric: str):
    """Print each grouping-set aggregation separately.

    Musterlösung 'showMinMaxAvgSeperate'.

    :param spark: active SparkSession; temp view ``tempmma_gs`` must exist.
    :param limit: number of rows per DataFrame.show().
    :param metric: aggregate column to select and sort by,
        e.g. "minTemperatur".
    """
    # One query per grouping set; rows belonging to the other sets carry
    # NULL in at least one of the two key columns and are filtered out.
    # NOTE(review): column names are interpolated into SQL verbatim — fine
    # for this internal exercise, but not safe for untrusted input.
    grouping_pairs = (
        ("bundesland", "jahr"),
        ("stationId", "jahr"),
        ("bundesland", "monat"),
    )
    for first, second in grouping_pairs:
        print(f"Aggregation: {first} & {second}")
        statement = (
            f"SELECT {first}, {second}, {metric} FROM tempmma_gs "
            f"WHERE {first} IS NOT NULL AND {second} IS NOT NULL ORDER BY {metric}"
        )
        spark.sql(statement).show(limit, truncate=False)
|
|
|
|
# --- Execution ---
|
|
|
|
def main(scon, spark):
    """Run Aufgaben A–C end to end against the registered parquet views."""
    # NOTE(review): `scon` is accepted but never used here — kept for the
    # caller's signature.
    read_parquet_tables(spark)

    # --- Aufgabe A: rollup + plots for station Kempten (ID = 2559) ---
    create_mma_rollup(spark, 2559)
    for level in ["years", "quartals", "months", "days"]:
        plot_date_values(spark, level)

    # --- Aufgabe B: monthly aggregates, then rankings with/without year ---
    create_tempmonat(spark)
    print("Rangfolgen 2015:")
    rank_temperatures(spark, 18, 2015)
    print("Rangfolgen Gesamt:")
    rank_temperatures(spark, 18)

    # --- Aufgabe C: grouping sets ---
    create_grouping_sets_view(spark)
    show_seperate_gs(spark, 5, "minTemperatur")
|
|
|
|
if __name__ == "__main__":
    # scon and spark come pre-built from the course's sparkstart module.
    main(scon, spark)