Aufgabe 10
@@ -62,7 +62,7 @@ def compute_daily_and_yearly_frosts(spark: SparkSession):
     q_daily_max = (
         "SELECT stationId, date, SUBSTR(CAST(date AS STRING),1,4) AS year, MAX(TT_TU) AS max_temp "
         "FROM german_stations_data "
-        "WHERE TT_TU IS NOT NULL "
+        "WHERE TT_TU IS NOT NULL AND TT_TU > -50 AND TT_TU < 60 "
         "GROUP BY stationId, date"
     )
     daily_max = spark.sql(q_daily_max)
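Note on the tightened WHERE clause: IS NOT NULL alone does not catch sentinel readings, while the new plausibility bounds (-50 °C to 60 °C) do. Below is a minimal, self-contained sketch of the effect; the sample rows and the local SparkSession are hypothetical and only assume the common DWD convention of marking missing TT_TU values with -999, which survives an IS NOT NULL check but is not a real temperature.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("frost-filter-demo").getOrCreate()

# Hypothetical sample rows: (stationId, date, TT_TU); -999.0 stands in for a
# missing hourly reading, not a real temperature.
rows = [
    (44, 20240115, -3.2),
    (44, 20240115, -999.0),
    (44, 20240116, -999.0),   # day with only sentinel readings
]
spark.createDataFrame(rows, ["stationId", "date", "TT_TU"]) \
    .createOrReplaceTempView("german_stations_data")

# Old filter: the sentinel survives, so 2024-01-16 gets max_temp = -999.0 and
# would likely be miscounted as a frost day downstream.
spark.sql(
    "SELECT stationId, date, MAX(TT_TU) AS max_temp "
    "FROM german_stations_data "
    "WHERE TT_TU IS NOT NULL "
    "GROUP BY stationId, date"
).show()

# New filter from this commit: sentinel rows fall outside (-50, 60) and are
# dropped, so 2024-01-16 has no daily maximum instead of a bogus one.
spark.sql(
    "SELECT stationId, date, MAX(TT_TU) AS max_temp "
    "FROM german_stations_data "
    "WHERE TT_TU IS NOT NULL AND TT_TU > -50 AND TT_TU < 60 "
    "GROUP BY stationId, date"
).show()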
@@ -88,9 +88,6 @@ def compute_daily_and_yearly_frosts(spark: SparkSession):
|
|||||||
def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempten',)):
|
def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempten',)):
|
||||||
compute_daily_and_yearly_frosts(spark)
|
compute_daily_and_yearly_frosts(spark)
|
||||||
|
|
||||||
# Debug: check available years and data
|
|
||||||
spark.sql("SELECT year, COUNT(*) as cnt FROM station_year_frost GROUP BY year ORDER BY year").show(50)
|
|
||||||
|
|
||||||
q_hist = (
|
q_hist = (
|
||||||
f"SELECT frost_days, COUNT(*) AS station_count "
|
f"SELECT frost_days, COUNT(*) AS station_count "
|
||||||
f"FROM station_year_frost WHERE year = '{year}' GROUP BY frost_days ORDER BY frost_days"
|
f"FROM station_year_frost WHERE year = '{year}' GROUP BY frost_days ORDER BY frost_days"
|
||||||
@@ -101,7 +98,6 @@ def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempte
 
     if hist_pdf.empty:
         print(f"No frost data found for year {year}. Trying to find available years...")
-        # Try without year filter to see if data exists
        q_all = "SELECT frost_days, COUNT(*) AS station_count FROM station_year_frost GROUP BY frost_days ORDER BY frost_days"
        hist_pdf = spark.sql(q_all).toPandas()
        if hist_pdf.empty:
@@ -129,11 +125,10 @@ def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempte
         sname = r['station_name']
         print(f"Analyzing stationId={sid} name={sname}")
 
-        # compute frostdays + 5-yr and 20-yr rolling averages using window frame
         q_ts = (
             "SELECT year, frost_days, "
-            "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS avg_5, "
-            "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) ROWS BETWEEN 19 PRECEDING AND CURRENT ROW) AS avg_20 "
+            "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) RANGE BETWEEN 4 PRECEDING AND CURRENT ROW) AS avg_5, "
+            "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) RANGE BETWEEN 19 PRECEDING AND CURRENT ROW) AS avg_20 "
             f"FROM station_year_frost WHERE stationId = {sid} ORDER BY CAST(year AS INT)"
         )
         ts_df = spark.sql(q_ts)
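Note on the switch from ROWS to RANGE: with ORDER BY CAST(year AS INT), a RANGE frame of 4 PRECEDING averages only over years whose value falls within the last five calendar years, whereas a ROWS frame takes the previous four physical rows even if a gap in the station's record makes them decades older. A minimal sketch with hypothetical data (station id, years, and frost counts are made up) illustrating the difference:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("rows-vs-range-demo").getOrCreate()

# Hypothetical station 44 with no records for 2019-2022.
rows = [(44, "2016", 90), (44, "2017", 85), (44, "2018", 80), (44, "2023", 60), (44, "2024", 55)]
spark.createDataFrame(rows, ["stationId", "year", "frost_days"]) \
    .createOrReplaceTempView("station_year_frost")

spark.sql(
    "SELECT year, frost_days, "
    # ROWS: last 5 physical rows -> the 2024 value still averages 2016-2018 (avg = 74.0).
    "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) "
    "ROWS BETWEEN 4 PRECEDING AND CURRENT ROW) AS avg_5_rows, "
    # RANGE: only years within [year-4, year] -> the 2024 value averages 2023-2024 (avg = 57.5).
    "AVG(frost_days) OVER (PARTITION BY stationId ORDER BY CAST(year AS INT) "
    "RANGE BETWEEN 4 PRECEDING AND CURRENT ROW) AS avg_5_range "
    "FROM station_year_frost ORDER BY CAST(year AS INT)"
).show()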