From 350464164349e3f084e7510ccd06bad5ad603013 Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Sat, 10 Aug 2024 21:47:08 +0100 Subject: [PATCH 1/4] understanding dash --- dash_app.py | 22 ++++++++++++++++++++++ requirements.txt | 3 ++- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 dash_app.py diff --git a/dash_app.py b/dash_app.py new file mode 100644 index 0000000..d08736d --- /dev/null +++ b/dash_app.py @@ -0,0 +1,22 @@ +# Import packages +from dash import Dash, html, dash_table +import polars as pd + +# Incorporate data +df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder2007.csv') + +# Convert DataFrame to list of dictionaries +data = df.to_pandas().to_dict('records') + +# Initialize the app +app = Dash() + +# App layout +app.layout = [ + html.Div(children='My First App with Data'), + dash_table.DataTable(data=data) +] + +# Run the app +if __name__ == '__main__': + app.run(debug=True) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5950c70..d7b59fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ python-dotenv polars requests -pyyaml \ No newline at end of file +pyyaml +dash \ No newline at end of file From 201f8eb2c9b95dfd34b82eb4547a277f16682bcf Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Sun, 11 Aug 2024 10:42:05 +0100 Subject: [PATCH 2/4] more dash workings --- dash_app.py | 86 +++++++++++++++++++++++++++++++++++++++++------- requirements.txt | 6 +++- 2 files changed, 80 insertions(+), 12 deletions(-) diff --git a/dash_app.py b/dash_app.py index d08736d..64f2055 100644 --- a/dash_app.py +++ b/dash_app.py @@ -1,21 +1,85 @@ -# Import packages -from dash import Dash, html, dash_table -import polars as pd +import polars as pl +import plotly.express as px +from dash import Dash, html, dcc +import dash_bootstrap_components as dbc +from dash.dash_table import DataTable # Incorporate data -df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder2007.csv') +df = pl.read_parquet('data/warehouse/transactions.parquet') +print("Data loaded from Parquet file:") +print(df) + +relevant_data = df.sql(''' + SELECT + date, + sum(transaction_amount) as total + FROM self + GROUP BY date + ORDER BY date DESC + ''' +) +print("Data after SQL query:") +print(relevant_data) # Convert DataFrame to list of dictionaries -data = df.to_pandas().to_dict('records') +data = relevant_data.to_dicts() +print("Data converted to list of dictionaries:") +print(data) -# Initialize the app -app = Dash() +# Initialize the app with a dark theme +app = Dash(external_stylesheets=[dbc.themes.DARKLY]) + +# Create the line graph with dark mode styling +fig = px.line(relevant_data.to_pandas(), x="date", y="total", title='Spend Per Day') +fig.update_layout( + plot_bgcolor='black', + paper_bgcolor='black', + font_color='white' +) # App layout -app.layout = [ - html.Div(children='My First App with Data'), - dash_table.DataTable(data=data) -] +app.layout = dbc.Container( + [ + dbc.Row( + dbc.Col(html.Div("My First App with My Data", className="text-center text-light"), width=12) + ), + dbc.Row( + [ + dbc.Col( + dbc.Card( + dbc.CardBody( + [ + html.H4("Data Table", className="card-title"), + DataTable( + data=data, + columns=[{"name": i, "id": i} for i in relevant_data.columns], + page_size=5, + style_header={'backgroundColor': 'black', 'color': 'white'}, + style_cell={'backgroundColor': 'black', 'color': 'white'} + ) + ] + ), + className="mb-4" + ), + width=6 + ), + dbc.Col( + dbc.Card( + dbc.CardBody( + [ + html.H4("Spend Per Day", className="card-title"), + dcc.Graph(figure=fig) + ] + ), + className="mb-4" + ), + width=6 + ) + ] + ) + ], + fluid=True +) # Run the app if __name__ == '__main__': diff --git a/requirements.txt b/requirements.txt index d7b59fe..1642cd8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,8 @@ python-dotenv polars requests pyyaml -dash \ No newline at end of file +#visualisation requirements below +dash +pandas +pyarrow +dash-bootstrap-components \ No newline at end of file From 7b80b529989d387f5dfc577daf5d83ff7c53d3d6 Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Tue, 27 Aug 2024 15:12:44 +0100 Subject: [PATCH 3/4] changes to make dash app work --- dash_app.py | 129 ++++++++++++++++++++++++++++++---------- pipeline/dimensions.py | 12 ++++ pipeline/facts.py | 17 +++++- pipeline/raw_to_base.py | 2 +- 4 files changed, 126 insertions(+), 34 deletions(-) diff --git a/dash_app.py b/dash_app.py index 64f2055..36df2b4 100644 --- a/dash_app.py +++ b/dash_app.py @@ -1,47 +1,106 @@ +'''Module to create a Dash app that displays visualizations of YNAB data.''' + import polars as pl import plotly.express as px from dash import Dash, html, dcc import dash_bootstrap_components as dbc -from dash.dash_table import DataTable +import pandas as pd -# Incorporate data -df = pl.read_parquet('data/warehouse/transactions.parquet') -print("Data loaded from Parquet file:") -print(df) +# Load data +accounts = pl.read_parquet('data/warehouse/accounts.parquet') +categories = pl.read_parquet('data/warehouse/categories.parquet') +dates = pl.read_parquet('data/warehouse/dates.parquet') +payees = pl.read_parquet('data/warehouse/payees.parquet') +scheduled_transactions = pl.read_parquet('data/warehouse/scheduled_transactions.parquet') +transactions = pl.read_parquet('data/warehouse/transactions.parquet') -relevant_data = df.sql(''' +# Join transactions with accounts, categories, and payees to create a master DataFrame +master_df = transactions.join(categories, left_on='category_id', right_on='id', suffix='_category')\ + .join(accounts, left_on='account_id', right_on='id', suffix='_account')\ + .join(payees, left_on='payee_id', right_on='id', suffix='_payee')\ + .join(dates, left_on='transaction_date', right_on='date_id', suffix='_date')\ + +# Create aggregations +spend_per_day = master_df.sql(''' SELECT date, - sum(transaction_amount) as total + year, + month, + day, + ABS(SUM(transaction_amount)) as total FROM self - GROUP BY date + WHERE category_name != 'Inflow: Ready to Assign' + GROUP BY date, year, month, day ORDER BY date DESC ''' ) -print("Data after SQL query:") -print(relevant_data) + +spend_per_category = master_df.sql(''' + SELECT + category_name, + ABS(SUM(transaction_amount)) as total + FROM self + WHERE category_name != 'Inflow: Ready to Assign' + GROUP BY category_name + ORDER BY total DESC + ''' +) + +spend_per_payee = master_df.sql(''' + SELECT + payee_name, + ABS(SUM(transaction_amount)) as total + FROM self + WHERE payee_name != 'Starting Balance' + AND transaction_amount < 0 + GROUP BY payee_name + ORDER BY total DESC + ''' +) # Convert DataFrame to list of dictionaries -data = relevant_data.to_dicts() -print("Data converted to list of dictionaries:") -print(data) +spend_per_day_data = spend_per_day.to_dicts() +spend_per_category_data = spend_per_category.to_dicts() +spend_per_payee_data = spend_per_payee.to_dicts() -# Initialize the app with a dark theme -app = Dash(external_stylesheets=[dbc.themes.DARKLY]) +# Convert list of dictionaries to Pandas DataFrame +spend_per_day_df = pd.DataFrame(spend_per_day_data) +spend_per_category_df = pd.DataFrame(spend_per_category_data) +spend_per_payee_df = pd.DataFrame(spend_per_payee_data) -# Create the line graph with dark mode styling -fig = px.line(relevant_data.to_pandas(), x="date", y="total", title='Spend Per Day') -fig.update_layout( +spend_per_day_line = px.line(spend_per_day_df, x="date", y="total") +spend_per_day_line.update_layout( plot_bgcolor='black', paper_bgcolor='black', font_color='white' ) +spend_per_category_bar = px.bar(spend_per_category_df, x="category_name", y="total") +spend_per_category_bar.update_layout( + plot_bgcolor='black', + paper_bgcolor='black', + font_color='white' +) + +spend_per_payee_bar = px.bar(spend_per_payee_df, x="payee_name", y="total") +spend_per_payee_bar.update_layout( + plot_bgcolor='black', + paper_bgcolor='black', + font_color='white' +) + +# Initialize the app with a dark theme +app = Dash(external_stylesheets=[dbc.themes.DARKLY]) + # App layout app.layout = dbc.Container( [ dbc.Row( - dbc.Col(html.Div("My First App with My Data", className="text-center text-light"), width=12) + dbc.Col( + html.Div("Data Pipeline For YNAB, Preview Visualisations", + className="text-center text-light"), + width=12 + ) ), dbc.Row( [ @@ -49,14 +108,24 @@ app.layout = dbc.Container( dbc.Card( dbc.CardBody( [ - html.H4("Data Table", className="card-title"), - DataTable( - data=data, - columns=[{"name": i, "id": i} for i in relevant_data.columns], - page_size=5, - style_header={'backgroundColor': 'black', 'color': 'white'}, - style_cell={'backgroundColor': 'black', 'color': 'white'} - ) + html.H4("Spend Per Day", className="card-title"), + dcc.Graph(figure=spend_per_day_line) + ] + ), + className="mb-4" + ), + width=12 + ) + ] + ), + dbc.Row( + [ + dbc.Col( + dbc.Card( + dbc.CardBody( + [ + html.H4("Spend Per Category", className="card-title"), + dcc.Graph(figure=spend_per_category_bar) ] ), className="mb-4" @@ -67,8 +136,8 @@ app.layout = dbc.Container( dbc.Card( dbc.CardBody( [ - html.H4("Spend Per Day", className="card-title"), - dcc.Graph(figure=fig) + html.H4("Spend Per Payee", className="card-title"), + dcc.Graph(figure=spend_per_payee_bar) ] ), className="mb-4" @@ -83,4 +152,4 @@ app.layout = dbc.Container( # Run the app if __name__ == '__main__': - app.run(debug=True) \ No newline at end of file + app.run(debug=True) diff --git a/pipeline/dimensions.py b/pipeline/dimensions.py index af0080b..2bc51e7 100644 --- a/pipeline/dimensions.py +++ b/pipeline/dimensions.py @@ -191,6 +191,18 @@ class DimDate(Dimensions): except Exception as e: logging.error(f"Failed to create a new column to indicate if the date is a weekday or weekend: {e}") return + + # Create a primary key by concatenating year, month, and day with no separators + try: + dates_df = dates_df.with_columns([ + (pl.col('year').cast(pl.Utf8) + + pl.col('month').cast(pl.Utf8).str.zfill(2) + + pl.col('day').cast(pl.Utf8).str.zfill(2) + ).alias('date_id') + ]) + except Exception as e: + logging.error(f"Failed to create the primary key column: {e}") + return # Write the DataFrame to a new parquet file logging.info("Writing the transformed dates DataFrame to parquet file") try: diff --git a/pipeline/facts.py b/pipeline/facts.py index 7611826..272ef11 100644 --- a/pipeline/facts.py +++ b/pipeline/facts.py @@ -27,12 +27,23 @@ class FactTransactions(Facts): # Transform the DataFrame logging.info("Transforming the transactions DataFrame") + try: + # Ensure the date column is in datetime format + transactions_df = transactions_df.with_columns([ + pl.col("date").str.strptime(pl.Date, format="%Y-%m-%d").alias("date") + ]) + except Exception as e: + logging.error(f"Failed to covert the date to date format: {e}") + return + try: transactions_df = ( transactions_df .with_columns([ pl.col("id").alias("transaction_id"), - pl.col("date").alias("transaction_date"), + (pl.col("date").dt.year().cast(pl.Utf8) + + pl.col("date").dt.month().cast(pl.Utf8).str.zfill(2) + + pl.col("date").dt.day().cast(pl.Utf8).str.zfill(2)).alias("transaction_date"), pl.col("amount").alias("transaction_amount"), pl.col("memo").alias("transaction_memo"), pl.col("cleared").alias("transaction_cleared"), @@ -45,7 +56,7 @@ class FactTransactions(Facts): ]) .with_columns([ pl.col("memo").fill_null("unknown"), - (pl.col("amount") / 100).alias("transaction_amount"), + (pl.col("amount") / 1000).alias("transaction_amount"), ]) .drop([ "transfer_transaction_id", "matched_transaction_id", "import_id", @@ -98,7 +109,7 @@ class FactScheduledTransactions(Facts): ]) .with_columns([ pl.col("memo").fill_null("unknown"), - (pl.col("amount") / 100).alias("scheduled_transaction_amount"), + (pl.col("amount") / 1000).alias("scheduled_transaction_amount"), ]) .drop([ "subtransactions", "deleted","flag_name","account_name", diff --git a/pipeline/raw_to_base.py b/pipeline/raw_to_base.py index 932bb78..88bac37 100644 --- a/pipeline/raw_to_base.py +++ b/pipeline/raw_to_base.py @@ -130,7 +130,7 @@ Then move the files back in one at a time oldest to newest and run again for eac df = df.with_columns( pl.when(pl.col(col).is_null()) .then(pl.lit("null")) - .otherwise(pl.col(col).map_elements(lambda x: str(x) if x is not None else "null")) + .otherwise(pl.col(col).map_elements(lambda x: str(x) if x is not None else "null", return_dtype=pl.Utf8)) .alias(col) ) return df From 845f6a28cc1c4a7a1d376aef09ba5a42419b1300 Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Wed, 28 Aug 2024 12:54:01 +0100 Subject: [PATCH 4/4] separation of areas --- dash_app.py | 5 ---- main.py | 51 +++++++++++++++------------------------ pipeline/pipeline_main.py | 24 ++++++++++++++++++ 3 files changed, 43 insertions(+), 37 deletions(-) create mode 100644 pipeline/pipeline_main.py diff --git a/dash_app.py b/dash_app.py index 36df2b4..9533285 100644 --- a/dash_app.py +++ b/dash_app.py @@ -6,7 +6,6 @@ from dash import Dash, html, dcc import dash_bootstrap_components as dbc import pandas as pd -# Load data accounts = pl.read_parquet('data/warehouse/accounts.parquet') categories = pl.read_parquet('data/warehouse/categories.parquet') dates = pl.read_parquet('data/warehouse/dates.parquet') @@ -149,7 +148,3 @@ app.layout = dbc.Container( ], fluid=True ) - -# Run the app -if __name__ == '__main__': - app.run(debug=True) diff --git a/main.py b/main.py index 33e686b..834983c 100644 --- a/main.py +++ b/main.py @@ -8,10 +8,8 @@ import logging.config import logging.handlers import config.exit_codes as ec -from pipeline.ingest import Ingest -from pipeline.raw_to_base import RawToBase -from pipeline.dimensions import DimAccounts, DimCategories, DimPayees, DimDate -from pipeline.facts import FactTransactions, FactScheduledTransactions +from dash_app import app +from pipeline.pipeline_main import pipeline_main def set_up_logging(): try: @@ -37,41 +35,30 @@ dotenv.load_dotenv() API_TOKEN = os.getenv('API_TOKEN') BUDGET_ID = os.getenv('BUDGET_ID') -def main(): - if not API_TOKEN or not BUDGET_ID: - logging.error('API_TOKEN or BUDGET_ID is not set in .env file') - sys.exit(ec.MISSING_ENV_VARS) - try: - with open('config/config.yaml', 'r') as file: - config = yaml.safe_load(file) - except FileNotFoundError: - logging.error('config.yaml file not found') - sys.exit(ec.MISSING_CONFIG_FILE) - except yaml.YAMLError as e: - logging.error(f'Error loading config.yaml: {e}') - sys.exit(ec.CORRUPTED_CONFIG_FILE) +if not API_TOKEN or not BUDGET_ID: + logging.error('API_TOKEN or BUDGET_ID is not set in .env file') + sys.exit(ec.MISSING_ENV_VARS) - config['API_TOKEN'] = API_TOKEN - config['BUDGET_ID'] = BUDGET_ID +try: + with open('config/config.yaml', 'r') as file: + config = yaml.safe_load(file) +except FileNotFoundError: + logging.error('config.yaml file not found') + sys.exit(ec.MISSING_CONFIG_FILE) +except yaml.YAMLError as e: + logging.error(f'Error loading config.yaml: {e}') + sys.exit(ec.CORRUPTED_CONFIG_FILE) - logging.info('Starting data pipeline') +config['API_TOKEN'] = API_TOKEN +config['BUDGET_ID'] = BUDGET_ID - Ingest(config) - RawToBase(config) - DimAccounts(config) - DimCategories(config) - DimPayees(config) - DimDate(config) - FactTransactions(config) - FactScheduledTransactions(config) - - logging.info('Data pipeline completed successfully') - sys.exit(ec.SUCCESS) + #sys.exit(ec.SUCCESS) if __name__ == '__main__': try: - main() + pipeline_main(config) + app.run() #debug=True) except SystemExit as e: exit_code = e.code if exit_code == ec.SUCCESS: diff --git a/pipeline/pipeline_main.py b/pipeline/pipeline_main.py new file mode 100644 index 0000000..05d3a9b --- /dev/null +++ b/pipeline/pipeline_main.py @@ -0,0 +1,24 @@ +'''Module to run the data pipeline''' + +import logging + +from pipeline.ingest import Ingest +from pipeline.raw_to_base import RawToBase +from pipeline.dimensions import DimAccounts, DimCategories, DimPayees, DimDate +from pipeline.facts import FactTransactions, FactScheduledTransactions + + +def pipeline_main(config): + '''Run the data pipeline''' + logging.info('Starting data pipeline') + + Ingest(config) + RawToBase(config) + DimAccounts(config) + DimCategories(config) + DimPayees(config) + DimDate(config) + FactTransactions(config) + FactScheduledTransactions(config) + + logging.info('Data pipeline completed successfully')