diff --git a/dash_app.py b/dash_app.py
index 64f2055..36df2b4 100644
--- a/dash_app.py
+++ b/dash_app.py
@@ -1,47 +1,106 @@
+'''Module to create a Dash app that displays visualizations of YNAB data.'''
+
 import polars as pl
 import plotly.express as px
 from dash import Dash, html, dcc
 import dash_bootstrap_components as dbc
-from dash.dash_table import DataTable
+import pandas as pd
 
-# Incorporate data
-df = pl.read_parquet('data/warehouse/transactions.parquet')
-print("Data loaded from Parquet file:")
-print(df)
+# Load data
+accounts = pl.read_parquet('data/warehouse/accounts.parquet')
+categories = pl.read_parquet('data/warehouse/categories.parquet')
+dates = pl.read_parquet('data/warehouse/dates.parquet')
+payees = pl.read_parquet('data/warehouse/payees.parquet')
+scheduled_transactions = pl.read_parquet('data/warehouse/scheduled_transactions.parquet')
+transactions = pl.read_parquet('data/warehouse/transactions.parquet')
 
-relevant_data = df.sql('''
+# Join transactions with categories, accounts, payees, and dates to create a master DataFrame
+master_df = transactions.join(categories, left_on='category_id', right_on='id', suffix='_category')\
+    .join(accounts, left_on='account_id', right_on='id', suffix='_account')\
+    .join(payees, left_on='payee_id', right_on='id', suffix='_payee')\
+    .join(dates, left_on='transaction_date', right_on='date_id', suffix='_date')
+
+# Create aggregations
+spend_per_day = master_df.sql('''
     SELECT
         date,
-        sum(transaction_amount) as total
+        year,
+        month,
+        day,
+        ABS(SUM(transaction_amount)) as total
     FROM self
-    GROUP BY date
+    WHERE category_name != 'Inflow: Ready to Assign'
+    GROUP BY date, year, month, day
     ORDER BY date DESC
     '''
 )
-print("Data after SQL query:")
-print(relevant_data)
+
+spend_per_category = master_df.sql('''
+    SELECT
+        category_name,
+        ABS(SUM(transaction_amount)) as total
+    FROM self
+    WHERE category_name != 'Inflow: Ready to Assign'
+    GROUP BY category_name
+    ORDER BY total DESC
+    '''
+)
+
+spend_per_payee = master_df.sql('''
+    SELECT
+        payee_name,
+        ABS(SUM(transaction_amount)) as total
+    FROM self
+    WHERE payee_name != 'Starting Balance'
+    AND transaction_amount < 0
+    GROUP BY payee_name
+    ORDER BY total DESC
+    '''
+)
 
 # Convert DataFrame to list of dictionaries
-data = relevant_data.to_dicts()
-print("Data converted to list of dictionaries:")
-print(data)
+spend_per_day_data = spend_per_day.to_dicts()
+spend_per_category_data = spend_per_category.to_dicts()
+spend_per_payee_data = spend_per_payee.to_dicts()
 
-# Initialize the app with a dark theme
-app = Dash(external_stylesheets=[dbc.themes.DARKLY])
+# Convert list of dictionaries to Pandas DataFrame
+spend_per_day_df = pd.DataFrame(spend_per_day_data)
+spend_per_category_df = pd.DataFrame(spend_per_category_data)
+spend_per_payee_df = pd.DataFrame(spend_per_payee_data)
 
-# Create the line graph with dark mode styling
-fig = px.line(relevant_data.to_pandas(), x="date", y="total", title='Spend Per Day')
-fig.update_layout(
+spend_per_day_line = px.line(spend_per_day_df, x="date", y="total")
+spend_per_day_line.update_layout(
     plot_bgcolor='black',
     paper_bgcolor='black',
     font_color='white'
 )
 
+spend_per_category_bar = px.bar(spend_per_category_df, x="category_name", y="total")
+spend_per_category_bar.update_layout(
+    plot_bgcolor='black',
+    paper_bgcolor='black',
+    font_color='white'
+)
+
+spend_per_payee_bar = px.bar(spend_per_payee_df, x="payee_name", y="total")
+spend_per_payee_bar.update_layout(
+    plot_bgcolor='black',
+    paper_bgcolor='black',
+    font_color='white'
+)
+
+# Initialize the app with a dark theme
+app = Dash(external_stylesheets=[dbc.themes.DARKLY])
+
 # App layout
 app.layout = dbc.Container(
     [
         dbc.Row(
-            dbc.Col(html.Div("My First App with My Data", className="text-center text-light"), width=12)
+            dbc.Col(
+                html.Div("Data Pipeline For YNAB, Preview Visualisations",
+                         className="text-center text-light"),
+                width=12
+            )
         ),
         dbc.Row(
             [
@@ -49,14 +108,24 @@ app.layout = dbc.Container(
                     dbc.Card(
                         dbc.CardBody(
                             [
-                                html.H4("Data Table", className="card-title"),
-                                DataTable(
-                                    data=data,
-                                    columns=[{"name": i, "id": i} for i in relevant_data.columns],
-                                    page_size=5,
-                                    style_header={'backgroundColor': 'black', 'color': 'white'},
-                                    style_cell={'backgroundColor': 'black', 'color': 'white'}
-                                )
+                                html.H4("Spend Per Day", className="card-title"),
+                                dcc.Graph(figure=spend_per_day_line)
+                            ]
+                        ),
+                        className="mb-4"
+                    ),
+                    width=12
+                )
+            ]
+        ),
+        dbc.Row(
+            [
+                dbc.Col(
+                    dbc.Card(
+                        dbc.CardBody(
+                            [
+                                html.H4("Spend Per Category", className="card-title"),
+                                dcc.Graph(figure=spend_per_category_bar)
                             ]
                         ),
                         className="mb-4"
@@ -67,8 +136,8 @@ app.layout = dbc.Container(
                     dbc.Card(
                         dbc.CardBody(
                             [
-                                html.H4("Spend Per Day", className="card-title"),
-                                dcc.Graph(figure=fig)
+                                html.H4("Spend Per Payee", className="card-title"),
+                                dcc.Graph(figure=spend_per_payee_bar)
                             ]
                         ),
                         className="mb-4"
@@ -83,4 +152,4 @@ app.layout = dbc.Container(
 
 # Run the app
 if __name__ == '__main__':
-    app.run(debug=True)
\ No newline at end of file
+    app.run(debug=True)
diff --git a/pipeline/dimensions.py b/pipeline/dimensions.py
index af0080b..2bc51e7 100644
--- a/pipeline/dimensions.py
+++ b/pipeline/dimensions.py
@@ -191,6 +191,18 @@ class DimDate(Dimensions):
         except Exception as e:
             logging.error(f"Failed to create a new column to indicate if the date is a weekday or weekend: {e}")
             return
+
+        # Create a primary key by concatenating year, month, and day with no separators
+        try:
+            dates_df = dates_df.with_columns([
+                (pl.col('year').cast(pl.Utf8) +
+                pl.col('month').cast(pl.Utf8).str.zfill(2) +
+                pl.col('day').cast(pl.Utf8).str.zfill(2)
+                ).alias('date_id')
+            ])
+        except Exception as e:
+            logging.error(f"Failed to create the primary key column: {e}")
+            return
         # Write the DataFrame to a new parquet file
         logging.info("Writing the transformed dates DataFrame to parquet file")
         try:
diff --git a/pipeline/facts.py b/pipeline/facts.py
index 7611826..272ef11 100644
--- a/pipeline/facts.py
+++ b/pipeline/facts.py
@@ -27,12 +27,23 @@ class FactTransactions(Facts):
 
         # Transform the DataFrame
         logging.info("Transforming the transactions DataFrame")
+        try:
+            # Parse the "%Y-%m-%d" date strings into a pl.Date column
+            transactions_df = transactions_df.with_columns([
+                pl.col("date").str.strptime(pl.Date, format="%Y-%m-%d").alias("date")
+            ])
+        except Exception as e:
+            logging.error(f"Failed to convert the date to date format: {e}")
+            return
+
         try:
             transactions_df = (
                 transactions_df
                 .with_columns([
                     pl.col("id").alias("transaction_id"),
-                    pl.col("date").alias("transaction_date"),
+                    (pl.col("date").dt.year().cast(pl.Utf8) +
+                    pl.col("date").dt.month().cast(pl.Utf8).str.zfill(2) +
+                    pl.col("date").dt.day().cast(pl.Utf8).str.zfill(2)).alias("transaction_date"),
                     pl.col("amount").alias("transaction_amount"),
                     pl.col("memo").alias("transaction_memo"),
                     pl.col("cleared").alias("transaction_cleared"),
@@ -45,7 +56,7 @@ class FactTransactions(Facts):
                 ])
                 .with_columns([
                     pl.col("memo").fill_null("unknown"),
-                    (pl.col("amount") / 100).alias("transaction_amount"),
+                    (pl.col("amount") / 1000).alias("transaction_amount"),  # NOTE(review): presumably YNAB milliunits -> currency units; confirm
                 ])
                 .drop([
                     "transfer_transaction_id", "matched_transaction_id", "import_id",
@@ -98,7 +109,7 @@ class FactScheduledTransactions(Facts):
                 ])
                 .with_columns([
                     pl.col("memo").fill_null("unknown"),
-                    (pl.col("amount") / 100).alias("scheduled_transaction_amount"),
+                    (pl.col("amount") / 1000).alias("scheduled_transaction_amount"),  # NOTE(review): presumably YNAB milliunits -> currency units; confirm
                 ])
                 .drop([
                     "subtransactions", "deleted","flag_name","account_name",
diff --git a/pipeline/raw_to_base.py b/pipeline/raw_to_base.py
index 932bb78..88bac37 100644
--- a/pipeline/raw_to_base.py
+++ b/pipeline/raw_to_base.py
@@ -130,7 +130,7 @@ Then move the files back in one at a time oldest to newest and run again for eac
                 df = df.with_columns(
                     pl.when(pl.col(col).is_null())
                     .then(pl.lit("null"))
-                    .otherwise(pl.col(col).map_elements(lambda x: str(x) if x is not None else "null"))
+                    .otherwise(pl.col(col).map_elements(lambda x: str(x) if x is not None else "null", return_dtype=pl.Utf8))
                     .alias(col)
                 )
         return df