Files
BigData/Aufgabe 12/Aufgabe12.py
2025-12-11 21:30:51 +01:00

123 lines
3.7 KiB
Python

from sparkstart import scon, spark
from pyspark.sql import SparkSession
import time
import matplotlib.pyplot as plt
import pandas as pd
HDFSPATH_STATIONS = "hdfs://193.174.205.250:54310/home/heiserervalentin/"
HDFSPATH_STOCKS = "hdfs://193.174.205.250:54310/stocks/"
def init_view_stocks(spark):
"""Lädt die Stocks-Daten für Aufgabe 12"""
# Hinweis: Pfade anpassen, falls sie im Cluster anders liegen
spark.read.parquet(HDFSPATH_STOCKS + "stocks.parquet").createOrReplaceTempView("stocks")
spark.read.parquet(HDFSPATH_STOCKS + "portfolio.parquet").createOrReplaceTempView("portfolio")
# ---------------------------------------------------------
# AUFGABE 12
# ---------------------------------------------------------
def task_12_stocks_analysis(spark: SparkSession):
print("\n--- Aufgabe 12: Stocks & Portfolio ---")
# a) Erstes und letztes Datum je Symbol
print("a) Min/Max Datum pro Symbol")
t0 = time.time()
q_a = """
SELECT symbol, MIN(date) as first_date, MAX(date) as last_date
FROM stocks
GROUP BY symbol
ORDER BY symbol
"""
spark.sql(q_a).show(5)
print(f"Zeit a): {time.time()-t0:.2f}s")
# b) Aggregationen 2009
print("\nb) High/Low/Avg Close 2009")
t0 = time.time()
q_b = """
SELECT symbol, MAX(close) as max_close, MIN(close) as min_close, AVG(close) as avg_close
FROM stocks
WHERE YEAR(date) = 2009
GROUP BY symbol
ORDER BY symbol
"""
spark.sql(q_b).show(5)
print(f"Zeit b): {time.time()-t0:.2f}s")
# c) Lateral View (Explode Portfolio)
print("\nc) Lateral View: Aktien in Portfolios")
t0 = time.time()
q_c = """
SELECT
h.symbol,
SUM(h.amount) as total_shares,
COUNT(p.portfolioId) as num_portfolios,
AVG(h.amount) as avg_per_portfolio
FROM portfolio p
LATERAL VIEW explode(holdings) t AS h
GROUP BY h.symbol
ORDER BY h.symbol
"""
spark.sql(q_c).show(5)
print(f"Zeit c): {time.time()-t0:.2f}s")
# d) Symbole in keinem Portfolio (Anti Join)
print("\nd) Symbole ohne Portfolio")
t0 = time.time()
q_d = """
SELECT DISTINCT s.symbol
FROM stocks s
LEFT ANTI JOIN (
SELECT DISTINCT h.symbol
FROM portfolio p
LATERAL VIEW explode(holdings) t AS h
) p_sym ON s.symbol = p_sym.symbol
ORDER BY s.symbol
"""
spark.sql(q_d).show(5)
print(f"Zeit d): {time.time()-t0:.2f}s")
input(">> 12 a-d fertig. Check UI. Enter für e)...")
# e) Portfolio Wert Ende 2010
print("\ne) Portfolio Bewertung Ende 2010")
t0 = time.time()
q_last_price = """
SELECT symbol, close
FROM (
SELECT
symbol,
close,
ROW_NUMBER() OVER (PARTITION BY symbol ORDER BY date DESC) as rn
FROM stocks
WHERE YEAR(date) = 2010
) tmp
WHERE rn = 1
"""
spark.sql(q_last_price).createOrReplaceTempView("stocks_2010_end")
# Schritt 2: Portfolio explodieren, mit Preis joinen, berechnen, summieren
q_val = """
SELECT
p.portfolioId,
SUM(h.amount * s.close) as portfolio_value_2010
FROM portfolio p
LATERAL VIEW explode(holdings) t AS h
JOIN stocks_2010_end s ON h.symbol = s.symbol
GROUP BY p.portfolioId
ORDER BY p.portfolioId
"""
spark.sql(q_val).show(5)
print(f"Zeit e): {time.time()-t0:.2f}s")
def main(scon, spark):
# Aufgabe 12
init_view_stocks(spark)
task_12_stocks_analysis(spark)
if __name__ == '__main__':
main(scon, spark)