Mirror of https://github.com/Vale54321/BigData.git (synced 2025-12-16 11:49:34 +01:00)
Aufgabe 12/Aufgabe12.py · new file · 123 lines
@@ -0,0 +1,123 @@
from sparkstart import scon, spark
from pyspark.sql import SparkSession
import time
import matplotlib.pyplot as plt  # not used in this task
import pandas as pd  # not used in this task

HDFSPATH_STATIONS = "hdfs://193.174.205.250:54310/home/heiserervalentin/"  # not used in this task
HDFSPATH_STOCKS = "hdfs://193.174.205.250:54310/stocks/"


def init_view_stocks(spark):
    """Loads the stocks and portfolio data for Aufgabe 12."""
    # Note: adjust the paths if they are located differently on the cluster.
    spark.read.parquet(HDFSPATH_STOCKS + "stocks.parquet").createOrReplaceTempView("stocks")
    spark.read.parquet(HDFSPATH_STOCKS + "portfolio.parquet").createOrReplaceTempView("portfolio")
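
# The queries below rely on this layout (inferred from the SQL, not verified
# against the cluster): "stocks" has at least (symbol, date, close);
# "portfolio" has (portfolioId, holdings), where holdings is an array of
# structs with fields (symbol, amount). A quick check after init_view_stocks:
#
#     spark.table("stocks").printSchema()
#     spark.table("portfolio").printSchema()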


# ---------------------------------------------------------
# AUFGABE 12
# ---------------------------------------------------------

def task_12_stocks_analysis(spark: SparkSession):
    print("\n--- Aufgabe 12: Stocks & Portfolio ---")

    # a) First and last date per symbol
    print("a) Min/max date per symbol")
    t0 = time.time()
    q_a = """
        SELECT symbol, MIN(date) AS first_date, MAX(date) AS last_date
        FROM stocks
        GROUP BY symbol
        ORDER BY symbol
    """
    spark.sql(q_a).show(5)
    print(f"Time a): {time.time()-t0:.2f}s")
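
    # The same aggregation via the DataFrame API (a sketch for comparison;
    # reads the "stocks" view registered above):
    from pyspark.sql import functions as F
    (spark.table("stocks")
        .groupBy("symbol")
        .agg(F.min("date").alias("first_date"),
             F.max("date").alias("last_date"))
        .orderBy("symbol")
        .show(5))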

    # b) Aggregations for 2009
    print("\nb) High/low/avg close 2009")
    t0 = time.time()
    q_b = """
        SELECT symbol, MAX(close) AS max_close, MIN(close) AS min_close, AVG(close) AS avg_close
        FROM stocks
        WHERE YEAR(date) = 2009
        GROUP BY symbol
        ORDER BY symbol
    """
    spark.sql(q_b).show(5)
    print(f"Time b): {time.time()-t0:.2f}s")
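
    # DataFrame-API counterpart (a sketch; the only new piece compared to a)
    # is the year filter applied before aggregating):
    from pyspark.sql import functions as F
    (spark.table("stocks")
        .where(F.year("date") == 2009)
        .groupBy("symbol")
        .agg(F.max("close").alias("max_close"),
             F.min("close").alias("min_close"),
             F.avg("close").alias("avg_close"))
        .orderBy("symbol")
        .show(5))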

    # c) Lateral view (explode the portfolio holdings)
    print("\nc) Lateral view: stocks across portfolios")
    t0 = time.time()

    # COUNT(DISTINCT ...) so a portfolio that lists a symbol more than once
    # is still counted as one portfolio.
    q_c = """
        SELECT
            h.symbol,
            SUM(h.amount) AS total_shares,
            COUNT(DISTINCT p.portfolioId) AS num_portfolios,
            AVG(h.amount) AS avg_per_portfolio
        FROM portfolio p
        LATERAL VIEW explode(holdings) t AS h
        GROUP BY h.symbol
        ORDER BY h.symbol
    """
    spark.sql(q_c).show(5)
    print(f"Time c): {time.time()-t0:.2f}s")
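
    # explode() in the DataFrame API (a sketch; one output row per element of
    # the holdings array, keyed by the owning portfolioId):
    from pyspark.sql import functions as F
    exploded = spark.table("portfolio").select(
        "portfolioId", F.explode("holdings").alias("h"))
    (exploded
        .groupBy("h.symbol")
        .agg(F.sum("h.amount").alias("total_shares"),
             F.countDistinct("portfolioId").alias("num_portfolios"),
             F.avg("h.amount").alias("avg_per_portfolio"))
        .orderBy("symbol")
        .show(5))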

    # d) Symbols that appear in no portfolio (anti join)
    print("\nd) Symbols without a portfolio")
    t0 = time.time()
    q_d = """
        SELECT DISTINCT s.symbol
        FROM stocks s
        LEFT ANTI JOIN (
            SELECT DISTINCT h.symbol
            FROM portfolio p
            LATERAL VIEW explode(holdings) t AS h
        ) p_sym ON s.symbol = p_sym.symbol
        ORDER BY s.symbol
    """
    spark.sql(q_d).show(5)
    print(f"Time d): {time.time()-t0:.2f}s")
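
    # Anti join via the DataFrame API (a sketch; "left_anti" keeps exactly
    # the left-side rows that have no match on the right):
    from pyspark.sql import functions as F
    held = (spark.table("portfolio")
        .select(F.explode("holdings").alias("h"))
        .select("h.symbol")
        .distinct())
    (spark.table("stocks")
        .select("symbol").distinct()
        .join(held, on="symbol", how="left_anti")
        .orderBy("symbol")
        .show(5))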

    input(">> 12 a-d done. Check the Spark UI. Press Enter for e)...")

    # e) Portfolio value at the end of 2010
    print("\ne) Portfolio valuation at the end of 2010")
    t0 = time.time()

    # Step 1: last closing price per symbol in 2010. ROW_NUMBER ranks each
    # symbol's rows by date descending, so rn = 1 is the latest 2010 quote.
    q_last_price = """
        SELECT symbol, close
        FROM (
            SELECT
                symbol,
                close,
                ROW_NUMBER() OVER (PARTITION BY symbol ORDER BY date DESC) AS rn
            FROM stocks
            WHERE YEAR(date) = 2010
        ) tmp
        WHERE rn = 1
    """
    spark.sql(q_last_price).createOrReplaceTempView("stocks_2010_end")
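
    # The same window function in the DataFrame API (a sketch; builds the
    # per-symbol window and keeps the newest row, without registering a view):
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window
    w = Window.partitionBy("symbol").orderBy(F.col("date").desc())
    last_2010 = (spark.table("stocks")
        .where(F.year("date") == 2010)
        .withColumn("rn", F.row_number().over(w))
        .where(F.col("rn") == 1)
        .select("symbol", "close"))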

    # Step 2: explode the portfolios, join each holding with its year-end
    # price, multiply amount by price, and sum per portfolio.
    q_val = """
        SELECT
            p.portfolioId,
            SUM(h.amount * s.close) AS portfolio_value_2010
        FROM portfolio p
        LATERAL VIEW explode(holdings) t AS h
        JOIN stocks_2010_end s ON h.symbol = s.symbol
        GROUP BY p.portfolioId
        ORDER BY p.portfolioId
    """
    spark.sql(q_val).show(5)
    print(f"Time e): {time.time()-t0:.2f}s")


def main(scon, spark):
    # Aufgabe 12
    init_view_stocks(spark)
    task_12_stocks_analysis(spark)


if __name__ == '__main__':
    main(scon, spark)
Aufgabe 12/sparkstart.py · new file · 22 lines
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-

"""
Creates a Spark configuration and session for the cluster.
"""

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Connect to the cluster.
conf = SparkConf().setMaster("spark://193.174.205.250:7077").setAppName("HeisererValentin")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.executor.memory", '32g')
# Note: spark.driver.memory normally has to be set before the driver JVM
# starts (e.g. via spark-submit --driver-memory); setting it here may have
# no effect.
conf.set("spark.driver.memory", '8g')
conf.set("spark.cores.max", "40")
scon = SparkContext(conf=conf)

# getOrCreate() picks up the SparkContext created above.
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL") \
    .getOrCreate()