feat: ✨ Extraction now part of the main workflow
.gitignore
@@ -10,6 +10,8 @@ wheels/
 .venv

 dat_other/*
+tar_files/*
+gz_files/*
 dat_files/*
 asc_files/*
 csv_files/*
README.md
@@ -9,16 +9,23 @@ The project consists of a main pipeline workflow that processes multiple modules
 - `main.py`: Main pipeline orchestrator that calls on the modules as needed
 - `batch_nimrod.py`: Module for batch processing multiple NIMROD files with configurable bounding boxes
 - `generate_timeseries.py`: Module for extracting cropped rain data and creating rainfall timeseries
+- `extract.py`: Module for extracting the .dat files from the .gz.tar files downloaded from source

 ## Features

 ### main.py

 - Orchestrates the entire workflow pipeline
+- Uncompresses the packed .gz.tar files to DAT files
 - Processes DAT files to ASC format
 - Generates timeseries data for specified locations
 - Combines grouped CSV files into consolidated datasets formatted for InfoWorks ICM
+
+### extract.py
+
+- Converts each .gz.tar file to 288 .gz files (one day of 5-minute frames: 24 × 12 = 288)
+- Converts all .gz files to .dat files ready for processing

 ### batch_nimrod.py

 - Processes multiple NIMROD dat files
@@ -44,24 +51,28 @@ It is recommended to use UV for environment and package handling.
 1. Ensure all required packages are installed: `uv sync`
 1. Adjust the config.py file to match your needs.
-1. Ensure your .dat files are in the DAT_TOP_FOLDER (as per config location)
+1. Ensure your .gz.tar files are in the TAR_TOP_FOLDER (as per config location)
 1. Ensure your zone csv files are in the ZONE_FOLDER (as per config location)
 1. Run the main pipeline: `uv run main.py`. Note that you must first set the environment variable `PYTHON_GIL=0`.
 1. Find the output in the COMBINED_FOLDER (as per config location)

 The main pipeline will:

-1. Process DAT files to ASC format if needed
+1. Uncompress the .gz.tar files ready for processing
+1. Process DAT files to ASC format
 1. Generate timeseries data for specified locations
-1. Combine grouped CSV files into consolidated datasets
+1. Combine grouped locations into consolidated datasets

 ## Configuration

-The `config.py` file defines folder paths:
+The `config.py` file defines folder paths and file deletion options:

-- DAT_TOP_FOLDER: "./dat_files"
-- ASC_TOP_FOLDER: "./asc_files"
-- COMBINED_FOLDER: "./combined_files"
+- TAR_TOP_FOLDER = "./tar_files"
+- GZ_TOP_FOLDER = "./gz_files"
+- DAT_TOP_FOLDER = "./dat_files"
+- ASC_TOP_FOLDER = "./asc_files"
+- COMBINED_FOLDER = "./combined_files"
+- ZONE_FOLDER = "./zone_inputs"

 Example of how the zone csv files should look:
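The example rows themselves did not survive on this page. As a rough, hypothetical sketch only of how main.py appears to consume these files (one csv per zone in ZONE_FOLDER, with the first field of each row used as the location key, since main.py builds `results = {loc[0]: ...}`; the real column layout lives in the repository):

```python
# Hypothetical sketch: how zone csvs could feed `locations` and `zones` in
# main.py. The real column layout is not shown on this page; the first field
# per row is assumed to be the 1km grid / location identifier.
import csv
from pathlib import Path

from config import Config

locations: list[tuple] = []
zones: set[str] = set()

for zone_csv in Path(Config.ZONE_FOLDER).glob("*.csv"):
    zones.add(zone_csv.stem)  # assumption: one csv file per zone
    with open(zone_csv, newline="") as f:
        for row in csv.reader(f):
            locations.append(tuple(row))  # row[0] = location key used by main.py

print(f"Count of 1km Grids: {len(locations)}")
print(f"Count of Zones: {len(zones)}")
```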
config.py
@@ -1,8 +1,13 @@
 class Config:
+    TAR_TOP_FOLDER = "./tar_files"
+    GZ_TOP_FOLDER = "./gz_files"
     DAT_TOP_FOLDER = "./dat_files"
     ASC_TOP_FOLDER = "./asc_files"
     COMBINED_FOLDER = "./combined_files"

     ZONE_FOLDER = "./zone_inputs"

-    delete_dat_after_processing = False
+    delete_tar_after_processing = False
+    delete_gz_after_processing = True
+    delete_dat_after_processing = True
+    delete_asc_after_processing = True
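The delete_* flags gate cleanup after each conversion stage; extract.py below checks them for the tar and gz passes. A minimal sketch of the pattern, assuming a source file has just been converted successfully:

```python
# The cleanup pattern the pipeline stages follow: convert first, then delete
# the source only when its Config flag allows it (mirrors extract.py).
import os

from config import Config

def cleanup_if_configured(path: str, delete_flag: bool) -> None:
    # Delete a processed source file only if configured to do so.
    if delete_flag:
        os.remove(path)

# e.g. after a .tar has been unpacked:
# cleanup_if_configured(tar_path, Config.delete_tar_after_processing)
```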
main.py
@@ -6,12 +6,13 @@ import concurrent.futures
 from pathlib import Path

 from config import Config
-from modules import BatchNimrod, GenerateTimeseries
+from modules import BatchNimrod, GenerateTimeseries, Extract

 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )


 def process_pipeline(dat_file):
     # 1. Process DAT to ASC
     asc_file = batch._process_single_file(dat_file)

@@ -22,9 +23,21 @@ def process_pipeline(dat_file):
     file_results = timeseries.process_asc_file(asc_file, locations)
     return file_results


+def initialise_folders():
+    # Create every working folder from Config up front so later stages
+    # can assume they exist.
+    folder_list = [
+        Config.ASC_TOP_FOLDER,
+        Config.COMBINED_FOLDER,
+        Config.GZ_TOP_FOLDER,
+        Config.DAT_TOP_FOLDER,
+        Config.TAR_TOP_FOLDER,
+    ]
+    for path in folder_list:
+        Path(path).mkdir(exist_ok=True)
+
+
 if __name__ == "__main__":
-    os.makedirs(Path(Config.ASC_TOP_FOLDER), exist_ok=True)
-    os.makedirs(Path(Config.COMBINED_FOLDER), exist_ok=True)
+    initialise_folders()

     locations = []
     zones = set()

@@ -44,6 +57,7 @@ if __name__ == "__main__":
     logging.info(f"Count of 1km Grids: {len(locations)}")
     logging.info(f"Count of Zones: {len(zones)}")

+    extraction = Extract(Config)
     batch = BatchNimrod(Config)
     timeseries = GenerateTimeseries(Config, locations)

@@ -55,6 +69,9 @@ if __name__ == "__main__":
     # Initialize results structure
     results = {loc[0]: {"dates": [], "values": []} for loc in locations}

+    logging.info("Extracting tar and gz files")
+    extraction.run_extraction()
+
     # Get list of DAT files
     dat_files = [
         f for f in os.listdir(Path(Config.DAT_TOP_FOLDER)) if not f.startswith(".")
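The page truncates main.py before the fan-out, but the module imports concurrent.futures and defines a per-file process_pipeline, so the remaining step is presumably mapping that function over dat_files. A hedged, standalone sketch (the executor wiring, file names, and result-merging shape below are assumptions, not the repo's code):

```python
# Assumed continuation of main.py: fan process_pipeline out over the DAT
# files on a thread pool, then merge each file's per-location series.
import concurrent.futures

def process_pipeline(dat_file: str) -> dict:
    # Stand-in for main.py's real function (DAT -> ASC -> per-location series).
    return {"loc_a": {"dates": [dat_file], "values": [0.0]}}

dat_files = ["202401010000.dat", "202401010005.dat"]  # hypothetical names
results = {"loc_a": {"dates": [], "values": []}}

with concurrent.futures.ThreadPoolExecutor() as executor:
    for file_results in executor.map(process_pipeline, dat_files):
        # Assumed result shape: {location_key: {"dates": [...], "values": [...]}}
        for loc, series in file_results.items():
            results[loc]["dates"].extend(series["dates"])
            results[loc]["values"].extend(series["values"])

print(results)
```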
modules/__init__.py (+2 -5)
@@ -1,9 +1,6 @@
 from .nimrod import Nimrod
 from .batch_nimrod import BatchNimrod
 from .generate_timeseries import GenerateTimeseries
+from .extract import Extract

-__all__ = [
-    "Nimrod",
-    "BatchNimrod",
-    "GenerateTimeseries",
-]
+__all__ = ["Nimrod", "BatchNimrod", "GenerateTimeseries", "Extract"]
modules/extract.py (new file, executable, +62)
@@ -0,0 +1,62 @@
+import tarfile
+import gzip
+import shutil
+import os
+from pathlib import Path
+
+
+class Extract:
+    # Unpacks the downloaded .gz.tar archives into .dat files in two passes.
+    def __init__(self, Config):
+        self.config = Config
+
+    def _extract_tar(self):
+        for tar_file in os.listdir(self.config.TAR_TOP_FOLDER):
+            # only handle .tar files
+            if not tar_file.endswith(".tar"):
+                continue
+
+            tar_path = Path(self.config.TAR_TOP_FOLDER, tar_file)
+
+            # Create a folder for extracted tar contents
+            extract_folder = Path(
+                self.config.GZ_TOP_FOLDER, tar_file.replace(".tar", "")
+            )
+            extract_folder.mkdir(exist_ok=True)
+
+            # Extract .tar file
+            with tarfile.open(tar_path, "r") as tar:
+                tar.extractall(path=extract_folder)
+
+            if self.config.delete_tar_after_processing:
+                os.remove(tar_path)
+
+    def _extract_gz(self):
+        for root, _, files in os.walk(self.config.GZ_TOP_FOLDER):
+            for file in files:
+                # only handle .gz files; adjust if extension differs
+                if not file.endswith(".dat.gz"):
+                    continue
+                gz_path = Path(root, file)
+                dat_path = Path(self.config.DAT_TOP_FOLDER, file.replace(".gz", ""))
+
+                # Unzip .gz file
+                with gzip.open(gz_path, "rb") as f_in:
+                    with open(dat_path, "wb") as f_out:
+                        shutil.copyfileobj(f_in, f_out)
+
+                if self.config.delete_gz_after_processing:
+                    os.remove(gz_path)
+
+        # Remove the gz working tree once extraction is finished, unless the
+        # config says the gz files should be kept.
+        if self.config.delete_gz_after_processing:
+            try:
+                shutil.rmtree(self.config.GZ_TOP_FOLDER)
+                print("processing complete and GZ files deleted")
+            except Exception as e:
+                print(str(e))
+                print(
+                    f"processing complete but GZ folder delete failed. Please delete manually ({self.config.GZ_TOP_FOLDER})"
+                )
+
+    def run_extraction(self):
+        self._extract_tar()
+        self._extract_gz()
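Since main.py drives this class with the Config class and a single run_extraction() call, the extraction stage can also be run on its own, for example:

```python
# Run only the extraction stage, outside the full pipeline.
# run_extraction() does the tar pass (.gz.tar -> .gz) and then the
# gz pass (.gz -> .dat), mirroring the calls main.py makes.
from config import Config
from modules import Extract

extraction = Extract(Config)
extraction.run_extraction()
```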
pyproject.toml (+1 -1)
@@ -1,6 +1,6 @@
 [project]
 name = "met-office"
-version = "1.1.1"
+version = "1.2.0"
 description = "Convert .dat nimrod files to .asc files"
 readme = "README.md"
 requires-python = ">=3.14"