Merge pull request #1 from Jake-Pullen/feature/better_error_handling

Feature/better error handling
2024-08-10 19:48:22 +01:00
parent fa6332e18a 19b633eb40
commit b8681f010f
11 changed files with 550 additions and 237 deletions
@@ -7,3 +7,4 @@ data/*
 __pycache__/*
 */__pycache__/*
 *.pbix
+/logs/*
@@ -24,3 +24,5 @@ raw_data_path: data/raw
 processed_data_path: data/processed
 base_data_path: data/base
 warehouse_data_path: data/warehouse
+REQUESTS_MAX_RETRIES: 3
+REQUESTS_RETRY_DELAY: 5
@@ -0,0 +1,41 @@
+import datetime as dt
+import json
+import logging
+from typing import override
+
+class custom_json_logger(logging.Formatter):
+    def __init__(
+        self,
+        *,
+        format_keys: dict[str,str] | None = None,
+    ):
+        super().__init__()
+        self.format_keys = format_keys if format_keys is not None else {}
+
+    @override
+    def format(self, record: logging.LogRecord) -> str:
+        record_dict = self._prepare_log_dict(record)
+        return json.dumps(record_dict, default=str)
+    
+    def _prepare_log_dict(self, record: logging.LogRecord) -> dict:
+        always_fields = {
+            "message" : record.getMessage(),
+            "timestamp" : dt.datetime.fromtimestamp(
+                record.created, tz=dt.timezone.utc
+            ).isoformat(),
+        }
+        if record.exc_info is not None:
+            always_fields["exc_info"] = self.formatException(record.exc_info)
+        
+        if record.stack_info is not None:
+            always_fields["stack_info"] = self.formatStack(record.stack_info)
+        
+        message = {
+            key: msg_val
+            if (msg_val := always_fields.pop(val, None)) is not None
+            else getattr(record, val)
+            for key, val in self.format_keys.items()
+        }
+        message.update(always_fields)
+        return message
+
@@ -0,0 +1,12 @@
+SUCCESS = 0
+MISSING_ENV_VARS = 1
+MISSING_CONFIG_FILE = 2
+CORRUPTED_CONFIG_FILE = 3
+UNAUTHORIZED_API_TOKEN = 4
+REQUESTS_ERROR = 5
+BAD_REQUEST = 6
+FORBIDDEN = 7
+NOT_FOUND = 8
+CONFLICT = 9
+MOVE_FILE_ERROR = 10
+DUPLICATE_RESOLUTION_ERROR = 11
@@ -0,0 +1,41 @@
+version: 1
+disable_existing_loggers: False
+formatters:
+  simple:
+    format: "%(asctime)s - %(levelname)s - %(module)s - %(funcName)s - %(message)s" 
+    datefmt: "%Y-%m-%d %H:%M:%S%z"
+  json:
+    "()": config.custom_json_logger.custom_json_logger
+    format_keys:
+      level: levelname
+      timestamp: timestamp
+      logger: name
+      module: module
+      function: funcName
+      line: lineno
+      message: message
+      thread_name: threadName
+handlers:
+  stderr:
+    class: logging.StreamHandler
+    level: INFO
+    formatter: simple
+    stream: ext://sys.stdout
+  file:
+    class: logging.handlers.RotatingFileHandler
+    level: DEBUG
+    formatter: json
+    filename: logs/dpfy_log.jsonl
+    maxBytes: 10485760 # 10MB
+    backupCount: 10
+  queue_handler:
+    class: logging.handlers.QueueHandler
+    handlers:
+      - stderr
+      - file
+    respect_handler_level: True
+loggers:
+  root:
+    level: DEBUG
+    handlers: 
+      - queue_handler
@@ -2,25 +2,61 @@ import os
 import dotenv
 import logging
 import yaml
+import sys
+import atexit
+import logging.config
+import logging.handlers

+import config.exit_codes as ec
 from pipeline.ingest import Ingest
 from pipeline.raw_to_base import RawToBase
 from pipeline.dimensions import DimAccounts, DimCategories, DimPayees, DimDate
 from pipeline.facts import FactTransactions, FactScheduledTransactions

+def set_up_logging():
+    try:
+        with open('config/logging_config.yaml', 'r') as f:
+            log_config = yaml.safe_load(f)
+        logging.config.dictConfig(log_config)
+    except yaml.YAMLError as e:
+        print(f"Error parsing logging configuration file: {e}")
+        log_config = {}  # Initialize log_config to an empty dictionary
+        logging.basicConfig(level=logging.INFO)  # Fallback to a basic configuration
+    queue_handler = logging.getHandlerByName('queue_handler')
+    if queue_handler is not None:
+        queue_handler.listener.start()
+        atexit.register(queue_handler.listener.stop)
+
+logger = logging.getLogger("data_pipeline_for_ynab")
+os.makedirs('logs', exist_ok=True)
+set_up_logging()
+
+# Load environment variables
 dotenv.load_dotenv()

 API_TOKEN = os.getenv('API_TOKEN')
 BUDGET_ID = os.getenv('BUDGET_ID')
-logging.basicConfig(level=logging.DEBUG)

-with open('config.yaml', 'r') as file:
+def main():
+    if not API_TOKEN or not BUDGET_ID:
+        logging.error('API_TOKEN or BUDGET_ID is not set in .env file')
+        sys.exit(ec.MISSING_ENV_VARS)
+
+    try:
+        with open('config/config.yaml', 'r') as file:
            config = yaml.safe_load(file)
+    except FileNotFoundError:
+        logging.error('config.yaml file not found')
+        sys.exit(ec.MISSING_CONFIG_FILE)
+    except yaml.YAMLError as e:
+        logging.error(f'Error loading config.yaml: {e}')
+        sys.exit(ec.CORRUPTED_CONFIG_FILE)

-config['API_TOKEN'] = API_TOKEN
-config['BUDGET_ID'] = BUDGET_ID
+    config['API_TOKEN'] = API_TOKEN
+    config['BUDGET_ID'] = BUDGET_ID
+
+    logging.info('Starting data pipeline')

-if __name__ == '__main__':
    Ingest(config)
    RawToBase(config)
    DimAccounts(config)
@@ -29,3 +65,17 @@ if __name__ == '__main__':
    DimDate(config)
    FactTransactions(config)
    FactScheduledTransactions(config)
+
+    logging.info('Data pipeline completed successfully')
+    sys.exit(ec.SUCCESS)
+
+if __name__ == '__main__':
+    try:
+        main()
+    except SystemExit as e:
+        exit_code = e.code
+        if exit_code == ec.SUCCESS:
+            logging.info('Program exited successfully')
+        else:
+            logging.error(f'Program exited with code {exit_code}')
+        raise
@@ -21,10 +21,15 @@ class DimAccounts(Dimensions):

    def transform(self):
        # Read the parquet file into a polars DataFrame
+        try:
            accounts_df = pl.read_parquet(self.file_path)
+        except Exception as e:
+            logging.error(f"Failed to read the base accounts parquet file: {e}")
+            return

        # Transform the DataFrame
        logging.info("Transforming the accounts DataFrame")
+        try:
            accounts_df = (
                accounts_df
                .with_columns([
@@ -51,10 +56,16 @@ class DimAccounts(Dimensions):
                    "debt_minimum_payments", "debt_escrow_amounts", "ingestion_date"
                ])
            )
+        except Exception as e:
+            logging.error(f"Failed to transform the accounts DataFrame: {e}")
+            return
        # Write the DataFrame to a new parquet file
        logging.info("Writing the transformed accounts DataFrame to parquet file")
+        try:
            accounts_df.write_parquet(self.config['warehouse_data_path'] + '/accounts.parquet')
-
+        except Exception as e:
+            logging.error(f"Failed to write the transformed accounts DataFrame to parquet file: {e}")
+            return

 class DimCategories(Dimensions):
    def __init__(self, config):
@@ -64,9 +75,13 @@ class DimCategories(Dimensions):
    
    def transform(self):
        # Read the parquet file into a polars DataFrame
+        try:
            categories_df = pl.read_parquet(self.file_path)
+        except Exception as e: 
+            logging.error(f"Failed to read the base categories parquet file: {e}")
+            return
        logging.info("Transforming the categories DataFrame")
-        # Select the required columns
+        try:
            categories_df = categories_df.select([
                'id',
                'name',
@@ -78,6 +93,11 @@ class DimCategories(Dimensions):
                'balance',
                'deleted'
            ])
+        except Exception as e:
+            logging.error(f"Failed to select columns from the categories DataFrame: {e}")
+            return
+        
+        try:
            # Rename the columns
            categories_df = categories_df.with_columns(pl.col('id').alias('category_id'))
            categories_df = categories_df.with_columns(pl.col('name').alias('category_name'))
@@ -89,10 +109,17 @@ class DimCategories(Dimensions):
            categories_df = categories_df.with_columns(pl.col('balance') / 100)
            categories_df = categories_df.with_columns(pl.col('budgeted') / 100)
            categories_df = categories_df.with_columns(pl.col('activity') / 100)
+        except Exception as e:
+            logging.error(f"Failed to transform the categories DataFrame: {e}")
+            return

        # Write the DataFrame to a new parquet file
        logging.info("Writing the transformed categories DataFrame to parquet file")
+        try:
            categories_df.write_parquet(self.config['warehouse_data_path'] + '/categories.parquet')
+        except Exception as e:
+            logging.error(f"Failed to write the transformed categories DataFrame to parquet file: {e}")
+            return
        
 class DimPayees(Dimensions):
    def __init__(self, config):
@@ -102,22 +129,36 @@ class DimPayees(Dimensions):
    
    def transform(self):
        # Read the parquet file into a polars DataFrame
+        try:
            payees_df = pl.read_parquet(self.file_path)
+        except Exception as e:
+            logging.error(f"Failed to read the base payees parquet file: {e}")
+            return
        logging.info("Transforming the payees DataFrame")
-        # Select the required columns
+        try:
            payees_df = payees_df.select([
                'id',
                'name',
                'deleted'
            ])
+        except Exception as e:
+            logging.error(f"Failed to select columns from the payees DataFrame: {e}")
+            return
+        try:
            # Rename the columns
            payees_df = payees_df.with_columns(pl.col('id').alias('payee_id'))
            payees_df = payees_df.with_columns(pl.col('name').alias('payee_name'))
+        except Exception as e:
+            logging.error(f"Failed to rename columns in the payees DataFrame: {e}")
+            return

        # Write the DataFrame to a new parquet file
        logging.info("Writing the transformed payees DataFrame to parquet file")
+        try:
            payees_df.write_parquet(self.config['warehouse_data_path'] + '/payees.parquet')
-
+        except Exception as e:
+            logging.error(f"Failed to write the transformed payees DataFrame to parquet file: {e}")
+            return

 class DimDate(Dimensions):
    def __init__(self, config):
@@ -126,20 +167,35 @@ class DimDate(Dimensions):
    
    def transform(self):
        # Create a DataFrame with dates from 2020-01-01 to 2030-12-31
+        try:
            dates_df = pl.DataFrame({'date':pl.date_range(date(2020, 1, 1), date(2030, 12, 31), "1d", eager=True)})
-        
+        except Exception as e:
+            logging.error(f"Failed to create a DataFrame with dates: {e}")
+            return
        # Extract year, month, day, and weekday from the date column
+        try:
            dates_df = dates_df.with_columns([
                pl.col('date').dt.year().alias('year'),
                pl.col('date').dt.month().alias('month'),
                pl.col('date').dt.day().alias('day'),
                pl.col('date').dt.weekday().alias('weekday')
            ])
+        except Exception as e:
+            logging.error(f"Failed to extract year, month, day, and weekday from the date column: {e}")
+            return 
+        try:
            # Create a new column to indicate if the date is a weekday or weekend
            dates_df = dates_df.with_columns([
                (pl.col('weekday') < 5).alias('is_weekday')  # True for weekdays (Monday to Friday), False for weekends (Saturday and Sunday)
            ])
+        except Exception as e:
+            logging.error(f"Failed to create a new column to indicate if the date is a weekday or weekend: {e}")
+            return
        # Write the DataFrame to a new parquet file
        logging.info("Writing the transformed dates DataFrame to parquet file")
+        try:
            dates_df.write_parquet(self.config['warehouse_data_path'] + '/dates.parquet')
+        except Exception as e:
+            logging.error(f"Failed to write the transformed dates DataFrame to parquet file: {e}")
+            return

@@ -1,7 +1,6 @@
 import polars as pl
 import logging
 import os
-from datetime import date

 class Facts:
    def __init__(self, config):
@@ -13,7 +12,6 @@ class Facts:
        return f"{self.base_file_path}/{file_name}"
    
 class FactTransactions(Facts):
-
    def __init__(self, config):
        super().__init__(config)
        self.file_path = self.get_full_file_path('transactions.parquet')
@@ -21,10 +19,15 @@ class FactTransactions(Facts):

    def transform(self):
        # Read the parquet file into a polars DataFrame
+        try:
            transactions_df = pl.read_parquet(self.file_path)
+        except FileNotFoundError:
+            logging.error("The transactions DataFrame does not exist")
+            return
        
        # Transform the DataFrame
        logging.info("Transforming the transactions DataFrame")
+        try:
            transactions_df = (
                transactions_df
                .with_columns([
@@ -51,13 +54,17 @@ class FactTransactions(Facts):
                    "debt_transaction_type","ingestion_date"
                ])
            )
-
+        except Exception as e:
+            logging.error(f"Failed to transform the transactions DataFrame: {e}")
+            return
        # Write the DataFrame to a new parquet file
        logging.info("Writing the transformed transactions DataFrame to parquet file")
+        try:
            transactions_df.write_parquet(self.config['warehouse_data_path'] + '/transactions.parquet')
+        except Exception as e:
+            logging.error(f"Failed to write the transformed transactions DataFrame: {e}")

 class FactScheduledTransactions(Facts):
-
    def __init__(self, config):
        super().__init__(config)
        self.file_path = self.get_full_file_path('scheduled_transactions.parquet')
@@ -73,6 +80,7 @@ class FactScheduledTransactions(Facts):

        # Transform the DataFrame
        logging.info("Transforming the scheduled transactions DataFrame")
+        try:
            scheduled_transactions_df = (
                scheduled_transactions_df
                .with_columns([
@@ -97,6 +105,12 @@ class FactScheduledTransactions(Facts):
                    "payee_name","category_name","ingestion_date"
                ])
            )
+        except Exception as e:
+            logging.error(f"Failed to transform the scheduled transactions DataFrame: {e}")
+            return
        # Write the DataFrame to a new parquet file
        logging.info("Writing the transformed scheduled transactions DataFrame to parquet file")
+        try:
            scheduled_transactions_df.write_parquet(self.config['warehouse_data_path'] + '/scheduled_transactions.parquet')
+        except Exception as e:
+            logging.error(f"Failed to write the transformed scheduled transactions DataFrame: {e}")
@@ -3,9 +3,14 @@ import time
 import json
 import logging
 import requests
+import sys
+import yaml
 from typing import Dict, Any
+import config.exit_codes as ec

 class Ingest:
+
+
    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the Ingest class with the provided configuration.
@@ -18,6 +23,8 @@ class Ingest:
        self.raw_data_path = config['raw_data_path']
        self.headers = {'Authorization': f'Bearer {self.api_token}'}
        self.knowledge_cache = self.load_knowledge_cache()
+        self.MAX_RETRIES = config['REQUESTS_MAX_RETRIES']
+        self.RETRY_DELAY = config['REQUESTS_RETRY_DELAY']
        self.fetch_and_cache_entity_data()

    def load_knowledge_cache(self) -> Dict[str, Any]:
@@ -38,8 +45,13 @@ class Ingest:
        if not os.path.exists(directory):
            os.makedirs(directory)
        entity_file = f'{directory}/{current_time}.json'
+        logging.info(f"Saving {entity} data to {entity_file}")
+        try:
            with open(entity_file, 'w') as f:
                json.dump(data, f, indent=4)
+        except Exception as e:
+            logging.error(f"Error saving {entity} data: {e}")
+

    def update_server_knowledge_cache(self, entity: str, server_knowledge: Any):
        """
@@ -49,8 +61,7 @@ class Ingest:
            with open(self.knowledge_file, 'r') as f:
                knowledge_cache = json.load(f)
        except FileNotFoundError:
-            # If the file does not exist, create an empty cache
-            # also create the file so we can save to it later
+            logging.info(f"Knowledge file not found. Creating a new one at {self.knowledge_file}. This is normal for the first run.")
            os.makedirs(os.path.dirname(self.knowledge_file), exist_ok=True)
            knowledge_cache = {}
        
@@ -71,41 +82,74 @@ class Ingest:
            if remaining_requests < 20:
                logging.warning("Approaching rate limit. Consider pausing further requests.")
                # Implement pause or delay logic here if necessary
+            if remaining_requests == 1:
+                logging.error("Rate limit exceeded. ending requests here and moving on with what we have.")
+                return True #returning True here to break out of any more ingestions
+                
        else:
            logging.warning("X-Rate-Limit header is missing.")
    
+    def handle_response(self, response) -> bool:
+        if response.status_code == 400:
+            logging.error("Bad request. The request could not be understood by the API due to malformed syntax or validation errors.")
+            sys.exit(ec.BAD_REQUEST)
+        elif response.status_code == 401:
+            logging.error("Unauthorized. Please check your API token.")
+            sys.exit(ec.UNAUTHORIZED_API_TOKEN)
+        elif response.status_code == 403:
+            logging.error("Forbidden. Access is denied.")
+            sys.exit(ec.FORBIDDEN)
+        elif response.status_code == 404:
+            logging.error("Not found. The specified URI does not exist.")
+            sys.exit(ec.NOT_FOUND)
+        elif response.status_code == 409:
+            logging.error("Conflict. The resource cannot be saved due to a conflict.")
+            sys.exit(ec.CONFLICT)
+        elif response.status_code == 429:
+            logging.error("Too many requests. You have made too many requests in a short amount of time.")
+            return True 
+        elif response.status_code == 500:
+            logging.error("Internal server error. The API experienced an unexpected error.")
+            return True
+        elif response.status_code == 503:
+            logging.error("Service unavailable. The API is temporarily disabled or a request timeout occurred.")
+            return True
+        else:
+            response.raise_for_status()
+            return False
+
    def fetch_and_cache_entity_data(self):
        """
        Fetch and cache data for all entities.
        """
        for entity in self.entities:
-            # if we already have files in the raw data folder, we need to skip that entity
            file_path = f'data/raw/{entity}'
            if os.path.exists(file_path) and os.listdir(file_path):
-                logging.warning(f"Skipping entity: {entity} as the raw data folder is not empty.")
-                continue
+                logging.warning(f"Raw data exists for {entity} processing any raw data we already have.")
+                break # break here instead of continue as we dont want to update our server knowledge cache and potentially miss data.
+
            last_knowledge = self.knowledge_cache.get(entity, 0)
-            logging.debug(f'Last Knowledge of {entity.capitalize()}: {last_knowledge}')
-            url = f'{self.base_url}/{self.budget_id}/{entity}'
-            if last_knowledge:
+            #logging.debug(f'Last Knowledge of {entity}: {last_knowledge}')
            logging.info(f'Fetching {entity} data since last knowledge: {last_knowledge}')
-                url = url + f'?last_knowledge_of_server={last_knowledge}'
+            url = f'{self.base_url}/{self.budget_id}/{entity}?last_knowledge_of_server={last_knowledge}'

+            for attempt in range(self.MAX_RETRIES):
+                try:
                    response = requests.get(url, headers=self.headers)
-            if response.status_code == 401:
-                logging.error("Unauthorized. Please check your API token.")
-                break
-            
-            self.check_rate_limit(response)
-            
-            if response.status_code == 429:
-                logging.error("Rate limit exceeded. Pausing until the limit is reset.")
-                # Implement pause until the limit reset logic here
-                break
+                    should_retry = self.handle_response(response)
+                    if not should_retry:
+                        break  # Exit the loop if the request is successful
+                except requests.exceptions.RequestException as e:
+                    logging.error(f"Error fetching {entity} data (attempt {attempt + 1}/{self.MAX_RETRIES}): {e}")
+                    if attempt < self.MAX_RETRIES - 1:
+                        time.sleep(self.RETRY_DELAY)  # Wait before retrying
+                    else:
+                        logging.error("Max retries reached. Exiting.")
+                        sys.exit(ec.REQUESTS_ERROR)
            
            data = response.json()
            server_knowledge = data['data'].get('server_knowledge')
-            logging.debug(f'{entity.capitalize()} Server Knowledge: {server_knowledge}')
+            logging.debug(f'{entity} new server knowledge: {server_knowledge}')
            
            if server_knowledge is not None and server_knowledge != last_knowledge:
                self.update_server_knowledge_cache(entity, server_knowledge)
@@ -114,3 +158,6 @@ class Ingest:
                self.save_entity_data_to_raw(entity, entity_data)
            else:
                logging.info(f"No new data for {entity}. Skipping cache update.")
+            
+            if self.check_rate_limit(response):
+                break # break out here and continue processing the data we have.
@@ -1,8 +1,10 @@
 import os
 import json
 import logging
+import sys
 from datetime import datetime
-from typing import List, Dict, Any
+from typing import Dict, Any
+import config.exit_codes as ec
 import polars as pl

 class RawToBase:
@@ -14,7 +16,6 @@ class RawToBase:
        self.base_data_path = config['base_data_path']
        self.data = {}
        self.base_data = {}
-        logging.basicConfig(level=logging.DEBUG)
        self.process_entities()

    def process_entities(self):
@@ -22,7 +23,6 @@ class RawToBase:
            # check the file is in the raw data path, if not skip the entity
            folder_path = os.path.join(self.raw_data_path, entity)
            folder_contents = os.listdir(folder_path)
-            # Check if the folder is empty
            if not folder_contents:
                logging.warning(f"The folder {folder_path} is empty skipping {entity}.")
                continue
@@ -31,61 +31,94 @@ class RawToBase:
                continue
            self._load_existing_base_data(entity)
            self._combine_data(entity)
-            self._resolve_duplicates(entity)
-            self._save_base_data(entity)
-            self._move_raw_to_processed(entity)
+            if not self._resolve_duplicates(entity):
+                logging.error(f"entity: {entity} failed duplicate resolution.")
+                sys.exit(ec.DUPLICATE_RESOLUTION_ERROR)
+            if not self._save_base_data(entity):
+                logging.error(f"Skipping processing for entity: {entity} due to failed saving base data.")
+                continue
+            if not self._move_raw_to_processed(entity):
+                logging.error(f"entity: {entity} has been processed, but we could not move the file out of the raw folder, please clear the raw folder for {entity}.")
+                sys.exit(ec.MOVE_FILE_ERROR)
    
    def _load_raw_data(self, entity):
        entity_path = os.path.join(self.raw_data_path, entity)
        self.data[entity] = []
        logging.debug(f"Loading data for entity: {entity} from path: {entity_path}")
        
-        for file_name in os.listdir(entity_path):
-            if file_name.endswith('.json'):
+        files = [f for f in os.listdir(entity_path) if f.endswith('.json')]
+        
+        if len(files) > 1:
+            logging.error(f"""More than one file found in path: {entity_path}. Skipping processing for entity: {entity}.
+recommended actions is to move the newest file(s) out, re-run main.py.
+Then move the files back in one at a time oldest to newest and run again for each file""")
+            return False
+        
+        if len(files) == 1:
+            file_name = files[0]
            file_path = os.path.join(entity_path, file_name)
            logging.debug(f"Reading file: {file_path}")
            try:
                with open(file_path, 'r') as f:
                    data = json.load(f)
-                        # Check if the data is empty
+            except Exception as e:
+                logging.error(f"Failed to load data from file: {file_path}, error: {e}")
+                return False
+            
+            if self._is_data_empty(entity, data, file_path):
+                return False
+            
+            modified_data = self._add_ingestion_date(entity, data, file_name)
+
+            self.data[entity].append(modified_data)
+            logging.debug(f"Successfully loaded data from file: {file_path}")
+        return True
+
+    def _is_data_empty(self, entity, data, file_path):
+        logging.debug(f"Checking if data is empty for entity: {entity}")
        if entity == "categories":
-                        # Check if any category group has categories
            has_categories = any(group.get("categories") for group in data.get("category_groups", []))
            if not has_categories:
                logging.warning(f"Received empty data for entity: {entity} in file: {file_path}, deleting file.")
                os.remove(file_path)
-                            return False
+                return True
        else:
            if not data.get(entity, []):
                logging.warning(f"Received empty data for entity: {entity} in file: {file_path}, deleting file.")
-                            # delete the file as it is empty
                os.remove(file_path)
+                return True
+        logging.debug(f"Data is not empty for entity: {entity}")
        return False
+    
+    def _add_ingestion_date(self, entity, data, file_name):
        modified_data = []
+        ingestion_date = datetime.strptime(file_name.split('.')[0], '%Y%m%d%H%M%S').date()
+        
+        logging.debug(f"Adding ingestion date to data for entity: {entity}")
        if entity == 'categories':
            for group in data.get('category_groups', []):
                for category in group.get('categories', []):
-                                category['ingestion_date'] = datetime.strptime(file_name.split('.')[0], '%Y%m%d%H%M%S').date()
+                    category['ingestion_date'] = ingestion_date
                    modified_data.append(category)
        else:
            for record in data.get(f'{entity}', []):
                if isinstance(record, dict):
-                                record['ingestion_date'] = datetime.strptime(file_name.split('.')[0], '%Y%m%d%H%M%S').date()
+                    record['ingestion_date'] = ingestion_date
                    modified_data.append(record)
                else:
-                                modified_data.append({'record': record, 'ingestion_date': datetime.strptime(file_name.split('.')[0], '%Y%m%d%H%M%S').date()})
-                    self.data[entity].append(modified_data)
-                    logging.debug(f"Successfully loaded data from file: {file_path}")
-                except Exception as e:
-                    logging.error(f"Failed to load data from file: {file_path}, error: {e}")
-                    exit(1)
-        return True
+                    modified_data.append({'record': record, 'ingestion_date': ingestion_date})
+        logging.debug(f"Successfully added ingestion date to data for entity: {entity}")
+        return modified_data

    def _load_existing_base_data(self, entity):
        base_path = os.path.join(self.base_data_path, f'{entity}.parquet')
        if os.path.exists(base_path):
            logging.debug(f"Loading existing base data for entity: {entity} from path: {base_path}")
+            try:
                self.base_data[entity] = pl.read_parquet(base_path)
+            except Exception as e:
+                logging.error(f"Failed to load existing base data for entity: {entity}, error: {e}, Creating an empty DataFrame")
+                self.base_data[entity] = pl.DataFrame()
            logging.debug(f"Successfully loaded existing base data for entity: {entity}")
        else:
            self.base_data[entity] = pl.DataFrame()
@@ -103,7 +136,6 @@ class RawToBase:
                combined_data.extend(data)
        
        new_data_df = pl.DataFrame(combined_data)
-        #print(new_data_df)
        
        # Ensure the unique id column is preserved
        unique_id = self.primary_keys[entity]['unique_id']
@@ -117,35 +149,52 @@ class RawToBase:
    def _resolve_duplicates(self, entity):
        logging.debug(f"Resolving duplicates for entity: {entity}")
        unique_id = self.primary_keys[entity]['unique_id']
+        try:
            self.base_data[entity] = self.base_data[entity].sort(by='ingestion_date').unique(subset=unique_id, keep='first')
+        except Exception as e:
+            logging.error(f"Failed to resolve duplicates for entity: {entity}, error: {e}")
+            return False
        logging.debug(f"Successfully resolved duplicates for entity: {entity}")
+        return True

    def _save_base_data(self, entity):
        os.makedirs(self.base_data_path, exist_ok=True)
        file_path = os.path.join(self.base_data_path, f'{entity}.parquet')
+        try:
            self.base_data[entity].write_parquet(file_path)
+        except Exception as e:
+            logging.error(f"Failed to save base data for entity: {entity}, error: {e}")
+            return False
        logging.debug(f"Saved base data for entity: {entity} to path: {file_path}")
+        return True
    
    def _move_raw_to_processed(self, entity):
        raw_entity_path = os.path.join(self.raw_data_path, entity)
        processed_path = os.path.join(self.processed_data_path, entity)
        
-        # logging.debug(f"Raw entity path: {raw_entity_path}")
-        # logging.debug(f"Processed path: {processed_path}")
-        
        os.makedirs(processed_path, exist_ok=True)
        
-        for file_name in os.listdir(raw_entity_path):
-            if file_name.endswith('.json'):
+        try:
+            files = [f for f in os.listdir(raw_entity_path) if f.endswith('.json')]
+            if len(files) != 1:
+                logging.error(f"Expected exactly one file in path: {raw_entity_path}, but found {len(files)}")
+                return False
+            
+            file_name = files[0]
            raw_file_path = os.path.join(raw_entity_path, file_name)
            processed_file_path = os.path.join(processed_path, file_name)
            
            logging.debug(f"Moving file: {raw_file_path} to {processed_file_path}")
            
-                if os.path.exists(raw_file_path):
            os.rename(raw_file_path, processed_file_path)
-                    logging.debug(f"Moved file: {file_name}")
-                else:
-                    logging.error(f"File not found: {raw_file_path}")
+            logging.debug(f"Moved file: {file_name} to processed")
        
-        logging.debug(f"Moved processed files for entity: {entity} to path: {processed_path}")
+        except FileNotFoundError as e:
+            logging.error(f"File not found: {e}")
+            return False
+        except Exception as e:
+            logging.error(f"Failed to move file for entity: {entity}, error: {e}")
+            return False
+        
+        logging.debug(f"Moved processed file for entity: {entity} to path: {processed_path}")
+        return True