commit de3782d570 (parent c072850289)
Date: 2025-12-04 17:42:03 +01:00

2 changed files with 16 additions and 2 deletions


@@ -60,8 +60,9 @@ def duration_circle_size(spark: SparkSession):
 def compute_daily_and_yearly_frosts(spark: SparkSession):
     q_daily_max = (
-        "SELECT stationId, date, SUBSTR(date,1,4) AS year, MAX(TT_TU) AS max_temp "
+        "SELECT stationId, date, SUBSTR(CAST(date AS STRING),1,4) AS year, MAX(TT_TU) AS max_temp "
         "FROM german_stations_data "
+        "WHERE TT_TU IS NOT NULL "
         "GROUP BY stationId, date"
     )
     daily_max = spark.sql(q_daily_max)
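
The CAST matters because SUBSTR is only well-defined on strings: making the DATE-to-STRING conversion explicit keeps the year extraction stable across Spark versions and ANSI settings, and the added WHERE keeps NULL temperatures out of the MAX aggregate. A minimal, self-contained sketch of the same extraction (the local SparkSession and toy rows are assumptions; year(date) is Spark's built-in alternative, not part of this commit):

    from pyspark.sql import SparkSession
    import datetime

    spark = SparkSession.builder.master("local[1]").appName("year-demo").getOrCreate()

    # Toy stand-in for german_stations_data (schema assumed from the diff above).
    df = spark.createDataFrame(
        [(44, datetime.date(2024, 1, 15), -3.2), (44, datetime.date(2024, 1, 15), None)],
        "stationId INT, date DATE, TT_TU DOUBLE",
    )
    df.createOrReplaceTempView("german_stations_data")

    # Explicit CAST makes SUBSTR well-defined on a DATE column; year(date)
    # is the equivalent built-in. The WHERE keeps NULLs out of MAX.
    spark.sql(
        "SELECT stationId, date, SUBSTR(CAST(date AS STRING),1,4) AS year, "
        "year(date) AS year_builtin, MAX(TT_TU) AS max_temp "
        "FROM german_stations_data "
        "WHERE TT_TU IS NOT NULL "
        "GROUP BY stationId, date"
    ).show()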
@@ -87,6 +88,9 @@ def compute_daily_and_yearly_frosts(spark: SparkSession):
 def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempten',)):
     compute_daily_and_yearly_frosts(spark)
+    # Debug: check available years and data
+    spark.sql("SELECT year, COUNT(*) as cnt FROM station_year_frost GROUP BY year ORDER BY year").show(50)
     q_hist = (
         f"SELECT frost_days, COUNT(*) AS station_count "
         f"FROM station_year_frost WHERE year = '{year}' GROUP BY frost_days ORDER BY frost_days"
@@ -94,6 +98,17 @@ def frost_analysis(spark: SparkSession, year=2024, station_name_matches=('kempten',)):
     hist_df = spark.sql(q_hist)
     hist_pdf = hist_df.toPandas()
+    if hist_pdf.empty:
+        print(f"No frost data found for year {year}. Trying to find available years...")
+        # Try without year filter to see if data exists
+        q_all = "SELECT frost_days, COUNT(*) AS station_count FROM station_year_frost GROUP BY frost_days ORDER BY frost_days"
+        hist_pdf = spark.sql(q_all).toPandas()
+        if hist_pdf.empty:
+            print("No frost data available at all. Check if TT_TU column contains valid temperature data.")
+            return
+        print(f"Found {len(hist_pdf)} frost day categories across all years")
     plt.figure(figsize=(8, 5))
     plt.bar(hist_pdf.frost_days, hist_pdf.station_count, color='steelblue')
     plt.xlabel('Number of Frost Days in year ' + str(year))
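
One caveat of this fallback: after dropping the year filter, the histogram pools all years while the x-axis label still names the requested year. A possible refinement (a sketch under the same table and schema assumptions, not part of this commit) is to fall back to the most recent year that has data, keeping the label truthful:

    # Sketch: pick the latest year with data rather than pooling all years.
    latest = spark.sql("SELECT MAX(year) AS y FROM station_year_frost").first()["y"]
    if latest is not None:
        print(f"Falling back to latest available year: {latest}")
        year = latest
        hist_pdf = spark.sql(
            f"SELECT frost_days, COUNT(*) AS station_count "
            f"FROM station_year_frost WHERE year = '{year}' "
            f"GROUP BY frost_days ORDER BY frost_days"
        ).toPandas()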


@@ -145,7 +145,6 @@ def import_produkt_files(spark: SparkSession, scon: SparkContext, path='/data/cd
     )
 def read_product_data_from_parquet(spark):
     """
-    read_product_data_from_parquet(spark)