Merge pull request #2 from Jake-Pullen/extraction

feat: ✨ Extraction now part of the main workflow
2025-12-11 08:52:55 +00:00
parent 1c6418e044 d386317957
commit ad6b31e644
8 changed files with 112 additions and 18 deletions
@@ -10,6 +10,8 @@ wheels/
 .venv
 dat_other/*
 tar_files/*
 gz_files/*
 dat_files/*
 asc_files/*
 csv_files/*
@@ -9,16 +9,23 @@ The project consists of a main pipeline workflow that processes multiple modules
 - `main.py`: Main pipeline orchestrator that calls on the modules as needed
 - `batch_nimrod.py`: Module for batch processing multiple NIMROD files with configurable bounding boxes
 - `generate_timeseries.py`: Module for extracting cropped rain data and creating rainfall timeseries
 - `extract.py`: Module for extracting the dat files from the .gz.tar files that are downloaded from source
 ## Features
 ### main.py
 - Orchestrates the entire workflow pipeline
 - Uncompress the packed .gz.tar files to DAT files
 - Processes DAT files to ASC format
 - Generates timeseries data for specified locations 
 - Combines grouped CSV files into consolidated datasets formatted for Infoworks ICM
 ### extract.py
 - Converts all .gz.tar files first to 288 (1 day) of .gz files
 - Converts all .gz files to .dat files ready for processing.
 ### batch_nimrod.py
 - Process multiple NIMROD dat files
@@ -44,24 +51,28 @@ It is recommended to use UV for environment and package handling.
 1. Ensure all required packages are installed `uv sync`
 1. Adjust the config.py file to match your needs.
-1. Ensure your .dat files are in the DAT_TOP_FOLDER (as per config location)
+1. Ensure your .gz.tar files are in the TAR_TOP_FOLDER (as per config location)
 1. Ensure your zone csv files are in the ZONE_FOLDER (as per config location)
 1. RunMain Pipeline `uv run main.py` Note that you will have to set your environment variable `PYTHON_GIL=0` first
 1. find the output in the COMBINED_FOLDER (as per config location)
 The main pipeline will:
-1. Process DAT files to ASC format if needed
+1. Uncompress the .gz.tar files ready for processing
 1. Process DAT files to ASC format
 1. Generate timeseries data for specified locations
-1. Combine grouped CSV files into consolidated datasets
+1. Combine grouped locations into consolidated datasets
 ## Configuration
-The `config.py` file defines folder paths:
+The `config.py` file defines folder paths and file deletion options:
- DAT_TOP_FOLDER: "./dat_files"
+- TAR_TOP_FOLDER = "./tar_files"
- ASC_TOP_FOLDER: "./asc_files"
+- GZ_TOP_FOLDER = "./gz_files"
- COMBINED_FOLDER: "./combined_files"
+- DAT_TOP_FOLDER = "./dat_files"
 - ASC_TOP_FOLDER = "./asc_files"
 - COMBINED_FOLDER = "./combined_files"
 - ZONE_FOLDER = "./zone_inputs"
 Example of how the zone csv files should look:
@@ -1,8 +1,13 @@
 class Config:
    TAR_TOP_FOLDER = "./tar_files"
    GZ_TOP_FOLDER = "./gz_files"
    DAT_TOP_FOLDER = "./dat_files"
    ASC_TOP_FOLDER = "./asc_files"
    COMBINED_FOLDER = "./combined_files"
    ZONE_FOLDER = "./zone_inputs"
-    delete_dat_after_processing = False
+    delete_tar_after_processing = False
    delete_gz_after_processing = True
    delete_dat_after_processing = True
    delete_asc_after_processing = True
@@ -6,12 +6,13 @@ import concurrent.futures
 from pathlib import Path
 from config import Config
-from modules import BatchNimrod, GenerateTimeseries
+from modules import BatchNimrod, GenerateTimeseries, Extract
 logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 def process_pipeline(dat_file):
    # 1. Process DAT to ASC
    asc_file = batch._process_single_file(dat_file)
@@ -22,9 +23,21 @@ def process_pipeline(dat_file):
    file_results = timeseries.process_asc_file(asc_file, locations)
    return file_results
 def initialise_folders():
    folder_list = [
        Config.ASC_TOP_FOLDER,
        Config.COMBINED_FOLDER,
        Config.GZ_TOP_FOLDER,
        Config.DAT_TOP_FOLDER,
        Config.TAR_TOP_FOLDER,
    ]
    for path in folder_list:
        Path(path).mkdir(exist_ok=True)
 if __name__ == "__main__":
-    os.makedirs(Path(Config.ASC_TOP_FOLDER), exist_ok=True)
+    initialise_folders()
    os.makedirs(Path(Config.COMBINED_FOLDER), exist_ok=True)
    locations = []
    zones = set()
@@ -44,6 +57,7 @@ if __name__ == "__main__":
    logging.info(f"Count of 1km Grids: {len(locations)}")
    logging.info(f"Count of Zones: {len(zones)}")
    extraction = Extract(Config)
    batch = BatchNimrod(Config)
    timeseries = GenerateTimeseries(Config, locations)
@@ -55,6 +69,9 @@ if __name__ == "__main__":
    # Initialize results structure
    results = {loc[0]: {"dates": [], "values": []} for loc in locations}
    logging.info("Extracting tar and gz files")
    extraction.run_extraction()
    # Get list of DAT files
    dat_files = [
        f for f in os.listdir(Path(Config.DAT_TOP_FOLDER)) if not f.startswith(".")
@@ -1,9 +1,6 @@
 from .nimrod import Nimrod
 from .batch_nimrod import BatchNimrod
 from .generate_timeseries import GenerateTimeseries
 from .extract import Extract
-__all__ = [
+__all__ = ["Nimrod", "BatchNimrod", "GenerateTimeseries", "Extract"]
    "Nimrod",
    "BatchNimrod",
    "GenerateTimeseries",
 ]
@@ -0,0 +1,62 @@
 import tarfile
 import gzip
 import shutil
 import os
 from pathlib import Path
 class Extract:
    # Directory containing .tar files
    def __init__(self, Config):
        self.config = Config
    def _extract_tar(self):
        for tar_file in os.listdir(self.config.TAR_TOP_FOLDER):
            # only handle .tar files
            if not tar_file.endswith(".tar"):
                pass
            tar_path = Path(self.config.TAR_TOP_FOLDER, tar_file)
            # Create a folder for extracted tar contents
            extract_folder = Path(
                self.config.GZ_TOP_FOLDER, tar_file.replace(".tar", "")
            )
            Path(extract_folder).mkdir(exist_ok=True)
            # Extract .tar file
            with tarfile.open(tar_path, "r") as tar:
                tar.extractall(path=extract_folder)
            if self.config.delete_tar_after_processing:
                os.remove(tar_path)
    def _extract_gz(self):
        for root, _, files in os.walk(self.config.GZ_TOP_FOLDER):
            for file in files:
                # only handle .gz files
                if not file.endswith(".dat.gz"):
                    pass  # adjust if extension differs
                gz_path = Path(root, file)
                dat_path = Path(self.config.DAT_TOP_FOLDER, file.replace(".gz", ""))
                # Unzip .gz file
                with gzip.open(gz_path, "rb") as f_in:
                    with open(dat_path, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
                if self.config.delete_gz_after_processing:
                    os.remove(gz_path)
        try:
            shutil.rmtree(self.config.GZ_TOP_FOLDER)
            print("processing complete and GZ files deleted")
        except Exception as e:
            print(str(e))
            print(
                f"processing complete but GZ folder delete failed. Please delete manually ({self.config.GZ_TOP_FOLDER})"
            )
    def run_extraction(self):
        self._extract_tar()
        self._extract_gz()
@@ -1,6 +1,6 @@
 [project]
 name = "met-office"
-version = "1.1.1"
+version = "1.2.0"
 description = "Convert .dat nimrod files to .asc files"
 readme = "README.md"
 requires-python = ">=3.14"
@@ -4,7 +4,7 @@ requires-python = ">=3.14"
 [[package]]
 name = "met-office"
-version = "1.1.1"
+version = "1.2.0"
 source = { virtual = "." }
 dependencies = [
    { name = "numpy" },