# Patch summary. Text placed before the first "diff --git" header is treated as
# leading commentary and skipped by patch tools.
#
# NOTE(review): this patch appears to have had its line breaks and indentation
# collapsed onto one line; the original newlines must be restored before it can
# be applied -- TODO confirm against the original commit.
#
# 1. data_check.py (new file, 17 added lines, no trailing newline):
#    - reads 'data/warehouse/transactions.parquet' with polars and prints the frame;
#    - runs a SQL query against the frame (referenced as `self`) that sums
#      transaction_amount per date, ordered by date descending, and prints the result.
#    - everything runs at module top level; there is no function or main guard.
#
# 2. pipeline/raw_to_base.py (RawToBase.process_entities, two hunks, one added line each):
#    - adds logging.info("Processing entity: ...") as the first statement shown
#      inside the `for entity in self.entities` loop, before the raw folder is listed;
#    - adds logging.info("Successfully processed entity: ...") after the branch that
#      logs an error and calls sys.exit(ec.MOVE_FILE_ERROR) when
#      _move_raw_to_processed(entity) returns a falsy value.
#    - NOTE(review): the exact indentation of the second added line (loop body vs.
#      a deeper block) cannot be determined from this view -- verify in the file.
diff --git a/data_check.py b/data_check.py new file mode 100644 index 0000000..09b2c77 --- /dev/null +++ b/data_check.py @@ -0,0 +1,17 @@ +import polars as pl + +df = pl.read_parquet('data/warehouse/transactions.parquet') +print("Data loaded from Parquet file:") +print(df) + +relevant_data = df.sql(''' + SELECT + date, + sum(transaction_amount) as total + FROM self + GROUP BY date + ORDER BY date DESC + ''' +) +print("Data after SQL query:") +print(relevant_data) \ No newline at end of file diff --git a/pipeline/raw_to_base.py b/pipeline/raw_to_base.py index 1e2072a..255a8ad 100644 --- a/pipeline/raw_to_base.py +++ b/pipeline/raw_to_base.py @@ -20,6 +20,7 @@ class RawToBase: def process_entities(self): for entity in self.entities: + logging.info(f"Processing entity: {entity}") # check the file is in the raw data path, if not skip the entity folder_path = os.path.join(self.raw_data_path, entity) folder_contents = os.listdir(folder_path) @@ -40,6 +41,7 @@ class RawToBase: if not self._move_raw_to_processed(entity): logging.error(f"entity: {entity} has been processed, but we could not move the file out of the raw folder, please clear the raw folder for {entity}.") sys.exit(ec.MOVE_FILE_ERROR) + logging.info(f"Successfully processed entity: {entity}") def _load_raw_data(self, entity): entity_path = os.path.join(self.raw_data_path, entity)