from sparkstart import scon, spark
from pyspark.sql import SparkSession
import time
import matplotlib.pyplot as plt
import pandas as pd

HDFSPATH_STATIONS = "hdfs://193.174.205.250:54310/home/heiserervalentin/"
HDFSPATH_STOCKS = "hdfs://193.174.205.250:54310/stocks/"


def init_view_stocks(spark):
    """Loads the stocks data for task 12."""
    # Note: adjust the paths if they are located differently on the cluster
    spark.read.parquet(HDFSPATH_STOCKS + "stocks.parquet").createOrReplaceTempView("stocks")
    spark.read.parquet(HDFSPATH_STOCKS + "portfolio.parquet").createOrReplaceTempView("portfolio")


# ---------------------------------------------------------
# TASK 12
# ---------------------------------------------------------
def task_12_stocks_analysis(spark: SparkSession):
    print("\n--- Task 12: Stocks & Portfolio ---")

    # a) First and last date per symbol
    print("a) Min/max date per symbol")
    t0 = time.time()
    q_a = """
        SELECT symbol, MIN(date) AS first_date, MAX(date) AS last_date
        FROM stocks
        GROUP BY symbol
        ORDER BY symbol
    """
    spark.sql(q_a).show(5)
    print(f"Time a): {time.time()-t0:.2f}s")

    # b) Aggregations for 2009
    print("\nb) High/low/avg close 2009")
    t0 = time.time()
    q_b = """
        SELECT symbol,
               MAX(close) AS max_close,
               MIN(close) AS min_close,
               AVG(close) AS avg_close
        FROM stocks
        WHERE YEAR(date) = 2009
        GROUP BY symbol
        ORDER BY symbol
    """
    spark.sql(q_b).show(5)
    print(f"Time b): {time.time()-t0:.2f}s")

    # c) Lateral view (explode portfolio holdings)
    print("\nc) Lateral view: stocks held in portfolios")
    t0 = time.time()
    # COUNT(DISTINCT ...) so a symbol appearing twice in one portfolio's
    # holdings is not counted as two portfolios
    q_c = """
        SELECT h.symbol,
               SUM(h.amount) AS total_shares,
               COUNT(DISTINCT p.portfolioId) AS num_portfolios,
               AVG(h.amount) AS avg_per_portfolio
        FROM portfolio p LATERAL VIEW explode(holdings) t AS h
        GROUP BY h.symbol
        ORDER BY h.symbol
    """
    spark.sql(q_c).show(5)
    print(f"Time c): {time.time()-t0:.2f}s")

    # d) Symbols held in no portfolio (anti join)
    print("\nd) Symbols without a portfolio")
    t0 = time.time()
    q_d = """
        SELECT DISTINCT s.symbol
        FROM stocks s
        LEFT ANTI JOIN (
            SELECT DISTINCT h.symbol
            FROM portfolio p LATERAL VIEW explode(holdings) t AS h
        ) p_sym
        ON s.symbol = p_sym.symbol
        ORDER BY s.symbol
    """
    spark.sql(q_d).show(5)
    print(f"Time d): {time.time()-t0:.2f}s")

    input(">> 12 a-d done. Check the Spark UI. Press Enter for e)...")

    # e) Portfolio value at the end of 2010
    print("\ne) Portfolio valuation, end of 2010")
    t0 = time.time()
    # Step 1: last closing price per symbol in 2010 (window function)
    q_last_price = """
        SELECT symbol, close
        FROM (
            SELECT symbol, close,
                   ROW_NUMBER() OVER (PARTITION BY symbol ORDER BY date DESC) AS rn
            FROM stocks
            WHERE YEAR(date) = 2010
        ) tmp
        WHERE rn = 1
    """
    spark.sql(q_last_price).createOrReplaceTempView("stocks_2010_end")

    # Step 2: explode the portfolios, join with the prices, multiply, sum
    q_val = """
        SELECT p.portfolioId, SUM(h.amount * s.close) AS portfolio_value_2010
        FROM portfolio p LATERAL VIEW explode(holdings) t AS h
        JOIN stocks_2010_end s ON h.symbol = s.symbol
        GROUP BY p.portfolioId
        ORDER BY p.portfolioId
    """
    spark.sql(q_val).show(5)
    print(f"Time e): {time.time()-t0:.2f}s")


def main(scon, spark):
    # Task 12
    init_view_stocks(spark)
    task_12_stocks_analysis(spark)


if __name__ == '__main__':
    main(scon, spark)
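
# --- Optional sketch: flatten the holdings once --------------------------
# Tasks c), d), and e) each re-run LATERAL VIEW explode(holdings). A possible
# refinement (not wired into main above) is to explode once into a cached temp
# view and have all three queries read from it. The view name "holdings_flat"
# is a hypothetical choice for this sketch.
def init_view_holdings(spark: SparkSession):
    """Sketch: register the exploded holdings as a cached view."""
    spark.sql("""
        SELECT p.portfolioId, h.symbol, h.amount
        FROM portfolio p LATERAL VIEW explode(holdings) t AS h
    """).cache().createOrReplaceTempView("holdings_flat")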
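
# --- Optional sketch: DataFrame-API version of e) -------------------------
# The same end-of-2010 valuation expressed with the DataFrame API instead of
# SQL, assuming the schema used above: portfolio(portfolioId,
# holdings: array<struct<symbol, amount>>) and stocks(symbol, date, close).
# Kept uncalled; the SQL in task_12_stocks_analysis is the assignment's answer.
def portfolio_value_2010_df(spark: SparkSession):
    """Sketch: compute end-of-2010 portfolio values via the DataFrame API."""
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    stocks = spark.table("stocks")
    portfolio = spark.table("portfolio")

    # Last close per symbol in 2010, mirroring q_last_price.
    w = Window.partitionBy("symbol").orderBy(F.col("date").desc())
    last_close = (stocks
                  .where(F.year("date") == 2010)
                  .withColumn("rn", F.row_number().over(w))
                  .where(F.col("rn") == 1)
                  .select("symbol", "close"))

    # Explode holdings, join prices, and sum amount * close per portfolio.
    exploded = portfolio.select("portfolioId", F.explode("holdings").alias("h"))
    return (exploded
            .join(last_close, exploded["h.symbol"] == last_close["symbol"])
            .groupBy("portfolioId")
            .agg(F.sum(F.col("h.amount") * F.col("close"))
                  .alias("portfolio_value_2010"))
            .orderBy("portfolioId"))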