From b7d0f6cd994b33d59e098a071b332e074cbf0666 Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Tue, 11 Nov 2025 11:54:28 +0000 Subject: [PATCH] =?UTF-8?q?chore:=20=F0=9F=94=A7=20More=20cleaning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 38 +------- config.yaml | 3 - main.py | 74 +++++++------- modules/__init__.py | 3 +- modules/batch_nimrod.py | 10 +- modules/generate_timeseries.py | 173 +++++++++++++++------------------ 6 files changed, 130 insertions(+), 171 deletions(-) delete mode 100644 config.yaml diff --git a/config.py b/config.py index 5689239..62b0c04 100644 --- a/config.py +++ b/config.py @@ -1,36 +1,6 @@ -import yaml -import logging - class Config: - def __init__(self) -> None: - self.IN_TOP_FOLDER = "./dat_files" - self.OUT_TOP_FOLDER = "./asc_files" - self.CSV_TOP_FOLER = "./csv_files" - self.AREAS_FILE = 'areas.csv' - - - - def load_areas(self) -> dict: - """ - Load configuration from YAML file. - - Returns: - dict: Configuration dictionary containing bounding box information. - - Raises: - FileNotFoundError: If the config.yaml file is not found. - yaml.YAMLError: If there's an error parsing the YAML file. - """ - try: - with open(, "r") as file: - config = yaml.safe_load(file) - return config.get("bounding_box_info", {}) - except FileNotFoundError: - logging.error( - f"Config file {CONFIG_FILE} not found. Using default configuration." 
- ) - return {} - except yaml.YAMLError as e: - logging.error(f"Error parsing YAML file: {e}") - return {} + DAT_TOP_FOLDER = "./dat_files" + ASC_TOP_FOLDER = "./asc_files" + CSV_TOP_FOLDER = "./csv_files" + AREAS_FILE = 'areas.csv' diff --git a/config.yaml b/config.yaml deleted file mode 100644 index efff11f..0000000 --- a/config.yaml +++ /dev/null @@ -1,3 +0,0 @@ -IN_TOP_FOLDER: "./dat_files" -OUT_TOP_FOLDER: "./asc_files" -CSV_TOP_FOLER: "./csv_files" diff --git a/main.py b/main.py index aa251fa..ee8fd0d 100644 --- a/main.py +++ b/main.py @@ -1,46 +1,48 @@ import logging -import yaml +import time +import os +from pathlib import Path -CONFIG_FILE = "config.yaml" +from config import Config +from modules import BatchNimrod, GenerateTimeseries logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) -def load_config() -> dict: - """ - Load configuration from YAML file. +if __name__ == "__main__": + os.makedirs(Path(Config.ASC_TOP_FOLDER), exist_ok=True) + os.makedirs(Path(Config.CSV_TOP_FOLDER), exist_ok=True) + dat_file_count = [f for f in os.listdir(Path(Config.DAT_TOP_FOLDER))] + asc_file_count = [f for f in os.listdir(Path(Config.ASC_TOP_FOLDER))] - Returns: - dict: Configuration dictionary containing bounding box information. + locations = [ + # loc name, loc id, x loc, y loc, resolution + ["BRICSC", "TM0816", 608500, 216500, 1000], + ["HEACSC", "TF6842", 568500, 342500, 1000], + ] - Raises: - FileNotFoundError: If the config.yaml file is not found. - yaml.YAMLError: If there's an error parsing the YAML file. - """ - try: - with open(CONFIG_FILE, "r") as file: - config = yaml.safe_load(file) - return config.get("bounding_box_info", {}) - except FileNotFoundError: - logging.error( - f"Config file {CONFIG_FILE} not found. Using default configuration." 
- ) - return {} - except yaml.YAMLError as e: - logging.error(f"Error parsing YAML file: {e}") - return {} + batch = BatchNimrod(Config) + timeseries = GenerateTimeseries(Config) + + start = time.time() + logging.info("Starting to process DAT to ASC") + if len(dat_file_count) != len(asc_file_count): + batch.process_nimrod_files() + batch_checkpoint = time.time() + elapsed_time = batch_checkpoint - start + logging.info(f"DAT to ASC completed in {elapsed_time:.2f} seconds") + else: + logging.info("No need to process DAT files, skipping...") + time.sleep(1); batch_checkpoint = time.time() + + for place in locations: + logging.info(f'{place[0]} started generating timeseries data.') + timeseries.extract_cropped_rain_data(place) + place_checkpoint = time.time() + since_asc_create = place_checkpoint - batch_checkpoint + elapsed_time = place_checkpoint - start + logging.info(f"{place[0]} completed in {since_asc_create:.2f} seconds") + logging.info(f'total time so far {elapsed_time:.2f} seconds') - -os.makedirs(Path(OUT_TOP_FOLDER), exist_ok=True) -os.makedirs(Path(CSV_TOP_FOLDER), exist_ok=True) - - - - -# if __name__ == "__main__": -# start = time.time() -# process_nimrod_files() -# end = time.time() -# elapsed_time = end - start -# logging.info(f"Processing completed in {elapsed_time:.2f} seconds") \ No newline at end of file + logging.info('All Complete') \ No newline at end of file diff --git a/modules/__init__.py b/modules/__init__.py index 24daede..4a07d68 100644 --- a/modules/__init__.py +++ b/modules/__init__.py @@ -1,2 +1,3 @@ from .nimrod import Nimrod -from .batch_nimrod import process_nimrod_files \ No newline at end of file +from .batch_nimrod import BatchNimrod +from .generate_timeseries import GenerateTimeseries \ No newline at end of file diff --git a/modules/batch_nimrod.py b/modules/batch_nimrod.py index f6658bf..6d8c65b 100644 --- a/modules/batch_nimrod.py +++ b/modules/batch_nimrod.py @@ -13,22 +13,22 @@ class BatchNimrod(): Process all Nimrod files in the input directory, applying bounding
box clipping and exporting to ASC format. - This function reads all files from IN_TOP_FOLDER, applies the appropriate bounding + This function reads all files from DAT_TOP_FOLDER, applies the appropriate bounding box for each area, and exports clipped raster data to OUT_TOP_FOLDER. """ # Read all file names in the folder - files_to_process = [f for f in os.listdir(Path(self.config.IN_TOP_FOLDER))] + files_to_process = [f for f in os.listdir(Path(self.config.DAT_TOP_FOLDER))] logging.info(f"Processing {len(files_to_process)} files...") - for in_file in os.listdir(Path(self.config.IN_TOP_FOLDER)): - in_file_full = Path(self.config.IN_TOP_FOLDER, in_file) + for in_file in os.listdir(Path(self.config.DAT_TOP_FOLDER)): + in_file_full = Path(self.config.DAT_TOP_FOLDER, in_file) try: image = Nimrod(open(in_file_full, "rb")) out_file_name = f"{image.get_validity_time()}.asc" - out_file_path = Path(self.config.OUT_TOP_FOLDER, out_file_name) + out_file_path = Path(self.config.ASC_TOP_FOLDER, out_file_name) with open(out_file_path, "w") as outfile: image.extract_asc(outfile) diff --git a/modules/generate_timeseries.py b/modules/generate_timeseries.py index 5008d94..33ec912 100644 --- a/modules/generate_timeseries.py +++ b/modules/generate_timeseries.py @@ -4,124 +4,113 @@ import glob import pandas as pd from datetime import datetime -# Configuration -asc_path = "asc_files/" -asc_wildcard_file = "*.asc" -asc_mult_source = asc_path + asc_wildcard_file -def read_ascii_header(ascii_raster_file: str) -> list: - """Reads header information from an ASCII DEM +class GenerateTimeseries: + def __init__(self, config): + self.config = config - Args: - ascii_raster_file (str): Path to the ASCII raster file + def _read_ascii_header(self, ascii_raster_file: str) -> list: + """Reads header information from an ASCII DEM - Returns: - list: Header data as a list of floats - """ - with open(ascii_raster_file) as f: - header_data = [float(f.__next__().split()[1]) for x in range(6)] - return 
header_data + Args: + ascii_raster_file (str): Path to the ASCII raster file + + Returns: + list: Header data as a list of floats + """ + with open(ascii_raster_file) as f: + header_data = [float(f.__next__().split()[1]) for x in range(6)] + return header_data -def calculate_crop_coords(basin_header: list, radar_header: list) -> tuple: - """Calculate crop coordinates based on header data + def _calculate_crop_coords(self, basin_header: list, radar_header: list) -> tuple: + """Calculate crop coordinates based on header data - Args: - basin_header (list): Basin header data - radar_header (list): Radar header data + Args: + basin_header (list): Basin header data + radar_header (list): Radar header data - Returns: - tuple: (start_col, start_row, end_col, end_row) as integers - """ - y0_radar = radar_header[3] - x0_radar = radar_header[2] + Returns: + tuple: (start_col, start_row, end_col, end_row) as integers + """ + y0_radar = radar_header[3] + x0_radar = radar_header[2] - y0_basin = basin_header[3] - x0_basin = basin_header[2] + y0_basin = basin_header[3] + x0_basin = basin_header[2] - nrows_radar = radar_header[1] + nrows_radar = radar_header[1] - nrows_basin = 2 # hardcoded, we always expect 2 rows - ncols_basin = 2 # hardcoded, we always expect 2 columns + nrows_basin = 2 # hardcoded, likely to change? + ncols_basin = 2 # hardcoded, likely to change? 
- cellres_radar = radar_header[4] - cellres_basin = basin_header[4] + cellres_radar = radar_header[4] + cellres_basin = basin_header[4] - xp = x0_basin - x0_radar - yp = y0_basin - y0_radar + xp = x0_basin - x0_radar + yp = y0_basin - y0_radar - xpp = ncols_basin * cellres_basin - ypp = nrows_basin * cellres_basin + xpp = ncols_basin * cellres_basin + ypp = nrows_basin * cellres_basin - start_col = np.floor(xp / cellres_radar) - end_col = np.ceil((xpp + xp) / cellres_radar) + start_col = np.floor(xp / cellres_radar) + end_col = np.ceil((xpp + xp) / cellres_radar) - start_row = np.floor(nrows_radar - ((yp + ypp) / cellres_radar)) - end_row = np.ceil(nrows_radar - (yp / cellres_radar)) + start_row = np.floor(nrows_radar - ((yp + ypp) / cellres_radar)) + end_row = np.ceil(nrows_radar - (yp / cellres_radar)) - #print(start_col, start_row, end_col, end_row) - return int(start_col), int(start_row), int(end_col), int(end_row) + #print(start_col, start_row, end_col, end_row) + return int(start_col), int(start_row), int(end_col), int(end_row) -def extract_cropped_rain_data(location): - """Extract cropped rain data and create rainfall timeseries + def extract_cropped_rain_data(self, location): + """Extract cropped rain data and create rainfall timeseries - Returns: - None - """ - rainfile = [] + Returns: + None + """ + rainfile = [] + datetime_list = [] - # Create datetime list - datetime_list = [] - print(location) - for f in glob.iglob(asc_mult_source): - # print(f) - radar_header = read_ascii_header(f) - start_col, start_row, end_col, end_row = calculate_crop_coords( - location, radar_header - ) + for f in glob.iglob(f'{self.config.ASC_TOP_FOLDER}/*.asc'): + # print(f) + radar_header = self._read_ascii_header(f) + start_col, start_row, end_col, end_row = self._calculate_crop_coords( + location, radar_header + ) - start_col = int(round(start_col)) - start_row = int(round(start_row)) - end_col = int(round(end_col)) - end_row = int(round(end_row)) + start_col = 
int(round(start_col)) + start_row = int(round(start_row)) + end_col = int(round(end_col)) + end_row = int(round(end_row)) - cur_rawgrid = np.genfromtxt( - f, skip_header=6, filling_values=0.0, loose=True, invalid_raise=False - ) + cur_rawgrid = np.genfromtxt( + f, skip_header=6, filling_values=0.0, loose=True, invalid_raise=False + ) - cur_croppedrain = cur_rawgrid[start_row:end_row, start_col:end_col] - # Flatten the cropped rain data into a 1D array - cur_rainrow = cur_croppedrain.flatten() - rainfile.append(cur_rainrow) + cur_croppedrain = cur_rawgrid[start_row:end_row, start_col:end_col] + # Flatten the cropped rain data into a 1D array + cur_rainrow = cur_croppedrain.flatten() + rainfile.append(cur_rainrow[2]/32) - # Extract datetime from filename - filename = f.split("/")[-1] # Get just the filename - # 20240929 0015 - date_str = filename[:8] # YYYYMMDD - time_str = filename[8:12] # HHMM + # Extract datetime from filename + filename = f.split("/")[-1] # Get just the filename + date_str = filename[:8] # YYYYMMDD + time_str = filename[8:12] # HHMM - # Parse datetime - parsed_date = datetime.strptime(f"{date_str}{time_str}", "%Y%m%d%H%M") - datetime_list.append(parsed_date) + # Parse datetime + parsed_date = datetime.strptime(f"{date_str}{time_str}", "%Y%m%d%H%M") + datetime_list.append(parsed_date) - rainfile_arr = np.vstack(rainfile) + rainfile_arr = np.vstack(rainfile) - # Create DataFrame with datetime index - df = pd.DataFrame(rainfile_arr, index=datetime_list) - # sort the dataframe into date order - sorted_df = df.sort_index() - # add headers - header_row = ['rainfall_1', 'rainfall_2', 'rainfall_3', 'rainfall_4'] - file_name = f"csv_files/{location[0]}_timeseries_data.csv" - sorted_df.to_csv(file_name, sep=",", float_format="%1.4f", header=header_row, index_label='datetime') + # Create DataFrame with datetime index + df = pd.DataFrame(rainfile_arr, index=datetime_list) + # sort the dataframe into date order + sorted_df = df.sort_index() + # add headers + 
header_row = [location[1]] + file_name = f"{self.config.CSV_TOP_FOLDER}/{location[0]}_timeseries_data.csv" + sorted_df.to_csv(file_name, sep=",", float_format="%1.4f", header=header_row, index_label='datetime') -if __name__ == "__main__": - locations = [ - # loc name, loc id, x loc, y loc, resolution - ["BRICSC", "TM0816", 608500, 216500, 1000], - ["HEACSC", "TF6842", 568500, 342500, 1000], - ] - for place in locations: - extract_cropped_rain_data(place)