2025-12-11 20:55:44 +01:00
parent d18e9823e5
commit a0ac5dbf36
4 changed files with 394 additions and 330 deletions


@@ -5,7 +5,6 @@ import matplotlib.pyplot as plt
HDFSPATH = "hdfs://193.174.205.250:54310/"

def read_parquets(spark: SparkSession):
    stations_path = HDFSPATH + "home/heiserervalentin/german_stations.parquet"
    products_path = HDFSPATH + "home/heiserervalentin/german_stations_data.parquet"


@@ -1,366 +1,219 @@
from sparkstart import scon, spark
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt

HDFSPATH = "hdfs://193.174.205.250:54310/"


def read_parquet_tables(spark: SparkSession) -> None:
    stations_path = HDFSPATH + "home/heiserervalentin/german_stations.parquet"
    products_path = HDFSPATH + "home/heiserervalentin/german_stations_data.parquet"

    spark.read.parquet(stations_path).createOrReplaceTempView("german_stations")
    spark.read.parquet(products_path).createOrReplaceTempView("german_stations_data")

# --- Aufgabe A ---

def create_mma_rollup(spark: SparkSession, station_id: int):
    query = f"""
        WITH processed_data AS (
            SELECT
                TT_TU,
                hour AS messtunde,
                TO_DATE(SUBSTR(date, 1, 4), 'yyyy') AS jahr,
                TO_DATE(CONCAT(SUBSTR(date, 1, 4), '-', LPAD(CAST(QUARTER(TO_DATE(date, 'yyyyMMdd'))*3-2 AS STRING), 2, '0'), '-01')) AS quartal,
                TO_DATE(CONCAT(SUBSTR(date, 1, 4), '-', SUBSTR(date, 5, 2), '-01')) AS monat,
                TO_DATE(date, 'yyyyMMdd') AS tag
            FROM german_stations_data
            WHERE stationId = {station_id}
              AND TT_TU IS NOT NULL
              AND TT_TU <> -999
        )
        SELECT
            MIN(TT_TU) AS minTemperatur,
            MAX(TT_TU) AS maxTemperatur,
            AVG(TT_TU) AS avgTemperatur,
            jahr,
            quartal,
            monat,
            tag,
            messtunde
        FROM processed_data
        GROUP BY ROLLUP (jahr, quartal, monat, tag, messtunde)
        ORDER BY jahr, quartal, monat, tag, messtunde
    """
    df = spark.sql(query)
    df.createOrReplaceTempView("mmacdcdata")
    df.cache()
    df.show(10)
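
As an illustrative aside (not part of this commit): instead of telling the aggregation levels apart by their NULL pattern, Spark SQL's grouping_id() can label each ROLLUP row directly. A minimal sketch, assuming the same processed_data CTE and column names as in create_mma_rollup above:

# Sketch only: grouping_id() encodes which grouping columns were rolled up (first column = most significant bit).
level_query = """
    SELECT
        jahr, quartal, monat, tag, messtunde,
        MIN(TT_TU) AS minTemperatur,
        MAX(TT_TU) AS maxTemperatur,
        AVG(TT_TU) AS avgTemperatur,
        grouping_id(jahr, quartal, monat, tag, messtunde) AS gid
    FROM processed_data
    GROUP BY ROLLUP (jahr, quartal, monat, tag, messtunde)
"""
# gid = 1 -> daily rows, 3 -> monthly, 7 -> quarterly, 15 -> yearly, 31 -> grand total.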

def plot_date_values(spark: SparkSession, level: str):
    filters = {
        "days": "YEAR(jahr) > 2017 AND YEAR(jahr) < 2021 AND messtunde IS NULL AND tag IS NOT NULL",
        "months": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NOT NULL",
        "quartals": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NULL AND quartal IS NOT NULL",
        "years": "YEAR(jahr) > 1999 AND YEAR(jahr) < 2021 AND tag IS NULL AND monat IS NULL AND quartal IS NULL AND jahr IS NOT NULL"
    }
    x_col = {"days": "tag", "months": "monat", "quartals": "quartal", "years": "jahr"}

    pdf = spark.sql(f"SELECT * FROM mmacdcdata WHERE {filters[level]}").toPandas()
    if pdf.empty:
        return

    plt.figure(figsize=(10, 5))
    plt.plot(pdf[x_col[level]], pdf["maxTemperatur"], "red", label="Max")
    plt.plot(pdf[x_col[level]], pdf["avgTemperatur"], "green", label="Avg")
    plt.plot(pdf[x_col[level]], pdf["minTemperatur"], "blue", label="Min")
    plt.title(f"{level.capitalize()}")
    plt.legend()
    plt.grid(True)
    plt.show()
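
A small usage note (assumption, not from the commit): when this runs on a headless cluster node, plt.show() has no display to attach to; saving the figure is a common workaround.

# Sketch only: save instead of show when no display is available.
# import matplotlib
# matplotlib.use("Agg")  # select a non-interactive backend before importing pyplot
# plt.savefig(f"temperature_{level}.png", dpi=150)  # instead of plt.show()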

# --- Aufgabe B ---

def create_tempmonat(spark: SparkSession):
    query = """
        WITH base_data AS (
            SELECT
                d.stationId,
                gs.station_name AS stationsname,
                d.TT_TU,
                TO_DATE(SUBSTR(d.date, 1, 4), 'yyyy') AS jahr_val,
                TO_DATE(CONCAT(SUBSTR(d.date, 1, 4), '-', SUBSTR(d.date, 5, 2), '-01')) AS monat_val
            FROM german_stations_data d
            JOIN german_stations gs ON d.stationId = gs.stationId
            WHERE d.TT_TU IS NOT NULL AND d.TT_TU <> -999
        )
        SELECT
            stationId,
            stationsname,
            MIN(TT_TU) AS minTemperatur,
            MAX(TT_TU) AS maxTemperatur,
            AVG(TT_TU) AS avgTemperatur,
            jahr_val AS jahr,
            monat_val AS monat
        FROM base_data
        GROUP BY stationId, stationsname, jahr_val, monat_val
    """
    spark.sql(query).cache().createOrReplaceTempView("tempmonat")


def rank_temperatures(spark: SparkSession, limit: int, year: int = None):
    where_clause = f"WHERE YEAR(jahr) = {year}" if year else ""
    query = f"""
        SELECT stationid, stationsname, monat, minTemperatur,
               RANK() OVER (ORDER BY minTemperatur ASC) AS rangMIN,
               maxTemperatur,
               RANK() OVER (ORDER BY maxTemperatur DESC) AS rangMAX,
               avgTemperatur,
               RANK() OVER (ORDER BY avgTemperatur DESC) AS rangAVG
        FROM tempmonat
        {where_clause}
        ORDER BY rangMIN
    """
    spark.sql(query).show(limit, truncate=False)
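
As a sketch (not part of this commit): the previous version of this exercise ranked stations within each month; with the tempmonat view above, a per-month partitioned ranking could look like this.

# Sketch only: rank within each calendar month of 2015 instead of globally.
monthly_rank_query = """
    SELECT stationId, stationsname, monat, minTemperatur,
           RANK() OVER (PARTITION BY MONTH(monat) ORDER BY minTemperatur ASC)  AS rangMIN,
           RANK() OVER (PARTITION BY MONTH(monat) ORDER BY maxTemperatur DESC) AS rangMAX,
           RANK() OVER (PARTITION BY MONTH(monat) ORDER BY avgTemperatur DESC) AS rangAVG
    FROM tempmonat
    WHERE YEAR(jahr) = 2015
    ORDER BY MONTH(monat), rangMIN
"""
# spark.sql(monthly_rank_query).show(18, truncate=False)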

# --- Aufgabe C ---

def create_grouping_sets_view(spark: SparkSession):
    query = """
        WITH base_gs AS (
            SELECT
                d.stationId,
                -- TRIM removes stray whitespace so the tables stay clean
                TRIM(gs.bundesland) AS bundesland_clean,
                d.TT_TU,
                -- extract the year and the calendar month (1-12)
                YEAR(TO_DATE(d.date, 'yyyyMMdd')) AS jahr_val,
                MONTH(TO_DATE(d.date, 'yyyyMMdd')) AS monat_val
            FROM german_stations_data d
            JOIN german_stations gs ON d.stationId = gs.stationId
            WHERE d.TT_TU IS NOT NULL AND d.TT_TU <> -999
        )
        SELECT
            stationId,
            bundesland_clean AS bundesland,
            jahr_val AS jahr,
            monat_val AS monat,
            MIN(TT_TU) AS minTemperatur,
            MAX(TT_TU) AS maxTemperatur,
            AVG(TT_TU) AS avgTemperatur
        FROM base_gs
        GROUP BY GROUPING SETS (
            (bundesland_clean, jahr_val),   -- 1. year and federal state
            (stationId, jahr_val),          -- 2. year and station
            (bundesland_clean, monat_val)   -- 3. month and federal state
        )
    """
    df = spark.sql(query)
    df.cache()
    df.createOrReplaceTempView("tempmma_gs")

def show_seperate_gs(spark: SparkSession, limit: int):
    # Filter: stationId must be NULL, monat must be NULL
    spark.sql("""
        SELECT
            jahr,
            bundesland,
            minTemperatur,
            maxTemperatur,
            ROUND(avgTemperatur, 2) as avgTemperatur
        FROM tempmma_gs
        WHERE bundesland IS NOT NULL
          AND jahr IS NOT NULL
          AND stationId IS NULL
          AND monat IS NULL
        ORDER BY jahr DESC, bundesland ASC
    """).show(limit, truncate=False)

    # Filter: bundesland must be NULL, monat must be NULL
    spark.sql("""
        SELECT
            jahr,
            stationId,
            minTemperatur,
            maxTemperatur,
            ROUND(avgTemperatur, 2) as avgTemperatur
        FROM tempmma_gs
        WHERE stationId IS NOT NULL
          AND jahr IS NOT NULL
          AND bundesland IS NULL
        ORDER BY jahr DESC, stationId ASC
    """).show(limit, truncate=False)

    # Filter: stationId must be NULL, jahr must be NULL
    spark.sql("""
        SELECT
            monat,
            bundesland,
            minTemperatur,
            maxTemperatur,
            ROUND(avgTemperatur, 2) as avgTemperatur
        FROM tempmma_gs
        WHERE bundesland IS NOT NULL
          AND monat IS NOT NULL
          AND jahr IS NULL
        ORDER BY monat ASC, bundesland ASC
    """).show(limit, truncate=False)

def main(scon, spark):
    read_parquet_tables(spark)

    # Kempten ID = 2559
    create_mma_rollup(spark, 2559)
    for level in ["years", "quartals", "months", "days"]:
        plot_date_values(spark, level)

    create_tempmonat(spark)
    print("Rangfolgen 2015:")
    rank_temperatures(spark, 18, 2015)
    print("Rangfolgen Gesamt:")
    rank_temperatures(spark, 18)

    create_grouping_sets_view(spark)
    show_seperate_gs(spark, 10)


if __name__ == "__main__":
    main(scon, spark)
Aufgabe 12/Aufgabe12.py Normal file

@@ -0,0 +1,190 @@
from sparkstart import scon, spark
from pyspark.sql import SparkSession
HDFSPATH = "hdfs://193.174.205.250:54310/"
SOURCEPATH = HDFSPATH + "stocks/"
def read_data(spark: SparkSession) -> None:
"""
Loads the existing Parquet files from HDFS into Spark Views.
"""
print(f"--- Loading Views from {SOURCEPATH} ---")
try:
# Load Stocks
spark.read.parquet(SOURCEPATH + "stocks.parquet").createOrReplaceTempView("stocks")
print("-> View 'stocks' loaded.")
# Load Portfolio
spark.read.parquet(SOURCEPATH + "portfolio.parquet").createOrReplaceTempView("portfolio")
print("-> View 'portfolio' loaded.")
except Exception as e:
print(f"CRITICAL ERROR: Could not load data. {e}")
print("Please check if the path exists in HDFS.")
# --- Aufgabe A ---
def first_last_quotation(spark: SparkSession, num: int = 10) -> None:
print("\n--- Aufgabe A: First/Last Quotation ---")
query = """
SELECT symbol,
MIN(dt) AS altNotierung,
MAX(dt) AS neuNotierung
FROM stocks
GROUP BY symbol
ORDER BY symbol
"""
df_quotation = spark.sql(query)
df_quotation.show(num, truncate=False)
df_quotation.write.mode('overwrite').parquet(HDFSPATH + "home/heiserervalentin/nyse1.parquet")
print("-> Imported nyse1")
# --- Aufgabe B ---
def min_max_avg_close(spark: SparkSession, num: int = 10) -> None:
print("\n--- Aufgabe B: Min/Max/Avg Close 2009 ---")
query = """
SELECT symbol,
MIN(close) AS minClose,
MAX(close) AS maxClose,
AVG(close) AS avgClose
FROM stocks
WHERE YEAR(dt) = 2009
GROUP BY symbol
ORDER BY symbol
"""
df_close = spark.sql(query)
df_close.show(num, truncate=False)
df_close.write.mode('overwrite').parquet(HDFSPATH + "home/heiserervalentin/nyse2.parquet")
print("-> Imported nyse2")
# --- Aufgabe C ---
def sum_count_avg_portfolios(spark: SparkSession, num: int = 10) -> None:
print("\n--- Aufgabe C: Portfolio Aggregations ---")
# 1. Explode
query_explode = """
SELECT pid, Attr
FROM portfolio
LATERAL VIEW EXPLODE(bonds) AS Attr
"""
df_temp = spark.sql(query_explode)
df_temp.createOrReplaceTempView("temp")
# 2. Aggregate
query_agg = """
SELECT Attr.symbol AS symbol,
COUNT(pid) AS anzpid,
SUM(Attr.num) AS anzAktien,
AVG(Attr.num) AS avgAnzAktien
FROM temp
GROUP BY symbol
ORDER BY symbol
"""
df_sum_sel_cnt_avg = spark.sql(query_agg)
df_sum_sel_cnt_avg.show(num, truncate=False)
df_sum_sel_cnt_avg.write.mode('overwrite').parquet(HDFSPATH + "home/heiserervalentin/nyse3.parquet")
print("-> Imported nyse3")
# --- Aufgabe D ---
def symbols_not_in_portfolio(spark: SparkSession, num: int = 10) -> None:
print("\n--- Aufgabe D: Symbols not in Portfolio ---")
query_explode = """
SELECT Attr
FROM portfolio
LATERAL VIEW EXPLODE(bonds) AS Attr
"""
df_temp = spark.sql(query_explode)
df_temp.createOrReplaceTempView("tempport")
query_distinct = """
SELECT DISTINCT s.symbol
FROM stocks s
LEFT OUTER JOIN tempport p ON s.symbol = p.Attr.symbol
WHERE p.Attr.symbol IS NULL
ORDER BY s.symbol
"""
df_symbols = spark.sql(query_distinct)
df_symbols.show(num, truncate=False)
df_symbols.write.mode('overwrite').parquet(HDFSPATH + "home/heiserervalentin/nyse4.parquet")
print("-> Imported nyse4")
# --- Aufgabe E ---
def value_portfolio_2010(spark: SparkSession, num: int = 10) -> None:
print("\n--- Aufgabe E: Portfolio Value 2010 ---")
# 1. Explode the portfolio
query_portfolio = """
SELECT pid, Attr.symbol AS symbol, Attr.num AS anzAktien
FROM portfolio
LATERAL VIEW EXPLODE(bonds) AS Attr
ORDER BY pid
"""
df_lview = spark.sql(query_portfolio)
df_lview.createOrReplaceTempView("tempportfolio")
# df_lview.show(num, truncate=False)  # optional sanity check
# 2. Filter stocks (latest quote in 2010)
query_stocks = """
SELECT s.symbol, s.dt, s.close
FROM stocks s
INNER JOIN (
SELECT symbol, MAX(dt) AS datum
FROM stocks
GROUP BY symbol
) AS grpStocks
ON s.symbol = grpStocks.symbol AND s.dt = grpStocks.datum
WHERE YEAR(dt) = 2010
ORDER BY datum
"""
df_2010 = spark.sql(query_stocks)
df_2010.createOrReplaceTempView("tempstocks")
# df_2010.show(num, truncate=False)  # optional sanity check
# 3. Compute the value (join)
query_value = """
SELECT p.*, s.close * p.anzAktien AS wert
FROM tempportfolio p, tempstocks s
WHERE s.symbol = p.symbol
ORDER BY p.pid
"""
df_value = spark.sql(query_value)
df_value.createOrReplaceTempView("tempvalue")
# df_value.show(num, truncate=False)  # optional sanity check
# 4. Aggregate the total value
query_sum = """
SELECT pid, SUM(wert) AS gesamtwert
FROM tempvalue
GROUP BY pid
ORDER BY pid
"""
df_sum = spark.sql(query_sum)
df_sum.show(num, truncate=False)
df_sum.write.mode('overwrite').parquet(HDFSPATH + "home/heiserervalentin/nyse5.parquet")
print("-> Imported nyse5")
def main(scon, spark):
read_data(spark)
first_last_quotation(spark, 10)
min_max_avg_close(spark, 10)
sum_count_avg_portfolios(spark, 5)
symbols_not_in_portfolio(spark, 5)
value_portfolio_2010(spark, 10)
if __name__ == "__main__":
main(scon, spark)

Aufgabe 12/sparkstart.py Normal file

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
"""
Create a Spark configuration
"""
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
# connect to cluster
conf = SparkConf().setMaster("spark://193.174.205.250:7077").setAppName("HeisererValentin")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.executor.memory", '32g')
conf.set("spark.driver.memory", '8g')
conf.set("spark.cores.max", "40")
scon = SparkContext(conf=conf)
spark = SparkSession \
.builder \
.appName("Python Spark SQL") \
.getOrCreate()
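
A configuration note (assumption, not part of this commit): because a SparkContext already exists, the builder's appName is ignored and the session simply reuses that context. Handing the SparkConf to the builder directly would keep the configuration in one place; a minimal sketch:

# Sketch only: build the session from the same SparkConf and derive the context from it.
# spark = SparkSession.builder.config(conf=conf).getOrCreate()
# scon = spark.sparkContext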