From d386317957ea86e9698301c434c42f239c863a9b Mon Sep 17 00:00:00 2001
From: Jake Pullen <hello@jake-is.me>
Date: Thu, 11 Dec 2025 08:47:29 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20=E2=9C=A8=20Extraction=20now=20part=20o?=
 =?UTF-8?q?f=20the=20main=20workflow?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore          |  2 ++
 README.MD           | 25 +++++++++++++-----
 config.py           |  7 ++++-
 main.py             | 23 ++++++++++++++---
 modules/__init__.py |  7 ++---
 modules/extract.py  | 62 +++++++++++++++++++++++++++++++++++++++++++++
 pyproject.toml      |  2 +-
 uv.lock             |  2 +-
 8 files changed, 112 insertions(+), 18 deletions(-)
 create mode 100755 modules/extract.py

diff --git a/.gitignore b/.gitignore
index 07c8c0f..5e03441 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,8 @@ wheels/
 .venv
 
 dat_other/*
+tar_files/*
+gz_files/*
 dat_files/*
 asc_files/*
 csv_files/*
diff --git a/README.MD b/README.MD
index ba377a6..50fde5d 100644
--- a/README.MD
+++ b/README.MD
@@ -9,16 +9,23 @@ The project consists of a main pipeline workflow that processes multiple modules
 - `main.py`: Main pipeline orchestrator that calls on the modules as needed
 - `batch_nimrod.py`: Module for batch processing multiple NIMROD files with configurable bounding boxes
 - `generate_timeseries.py`: Module for extracting cropped rain data and creating rainfall timeseries
+- `extract.py`: Module for extracting the dat files from the .gz.tar files that are downloaded from source
 
 ## Features
 
 ### main.py
 
 - Orchestrates the entire workflow pipeline
+- Uncompress the packed .gz.tar files to DAT files
 - Processes DAT files to ASC format
 - Generates timeseries data for specified locations 
 - Combines grouped CSV files into consolidated datasets formatted for Infoworks ICM
 
+### extract.py
+
+- Converts all .gz.tar files first to 288 (1 day) of .gz files
+- Converts all .gz files to .dat files ready for processing.
+
 ### batch_nimrod.py
 
 - Process multiple NIMROD dat files
@@ -44,24 +51,28 @@ It is recommended to use UV for environment and package handling.
 
 1. Ensure all required packages are installed `uv sync`
 1. Adjust the config.py file to match your needs.
-1. Ensure your .dat files are in the DAT_TOP_FOLDER (as per config location)
+1. Ensure your .gz.tar files are in the TAR_TOP_FOLDER (as per config location)
 1. Ensure your zone csv files are in the ZONE_FOLDER (as per config location)
 1. RunMain Pipeline `uv run main.py` Note that you will have to set your environment variable `PYTHON_GIL=0` first
 1. find the output in the COMBINED_FOLDER (as per config location)
 
 The main pipeline will:
 
-1. Process DAT files to ASC format if needed
+1. Uncompress the .gz.tar files ready for processing
+1. Process DAT files to ASC format
 1. Generate timeseries data for specified locations
-1. Combine grouped CSV files into consolidated datasets
+1. Combine grouped locations into consolidated datasets
 
 ## Configuration
 
-The `config.py` file defines folder paths:
+The `config.py` file defines folder paths and file deletion options:
 
-- DAT_TOP_FOLDER: "./dat_files"
-- ASC_TOP_FOLDER: "./asc_files"
-- COMBINED_FOLDER: "./combined_files"
+- TAR_TOP_FOLDER = "./tar_files"
+- GZ_TOP_FOLDER = "./gz_files"
+- DAT_TOP_FOLDER = "./dat_files"
+- ASC_TOP_FOLDER = "./asc_files"
+- COMBINED_FOLDER = "./combined_files"
+- ZONE_FOLDER = "./zone_inputs"
 
 Example of how the zone csv files should look:
 
diff --git a/config.py b/config.py
index 22bbd56..e5b008b 100644
--- a/config.py
+++ b/config.py
@@ -1,8 +1,13 @@
 class Config:
+    TAR_TOP_FOLDER = "./tar_files"
+    GZ_TOP_FOLDER = "./gz_files"
     DAT_TOP_FOLDER = "./dat_files"
     ASC_TOP_FOLDER = "./asc_files"
     COMBINED_FOLDER = "./combined_files"
+
     ZONE_FOLDER = "./zone_inputs"
 
-    delete_dat_after_processing = False
+    delete_tar_after_processing = False
+    delete_gz_after_processing = True
+    delete_dat_after_processing = True
     delete_asc_after_processing = True
diff --git a/main.py b/main.py
index b038ccc..fa6fa40 100644
--- a/main.py
+++ b/main.py
@@ -6,12 +6,13 @@ import concurrent.futures
 from pathlib import Path
 
 from config import Config
-from modules import BatchNimrod, GenerateTimeseries
+from modules import BatchNimrod, GenerateTimeseries, Extract
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 
+
 def process_pipeline(dat_file):
     # 1. Process DAT to ASC
     asc_file = batch._process_single_file(dat_file)
@@ -22,9 +23,21 @@ def process_pipeline(dat_file):
     file_results = timeseries.process_asc_file(asc_file, locations)
     return file_results
 
+
+def initialise_folders():
+    folder_list = [
+        Config.ASC_TOP_FOLDER,
+        Config.COMBINED_FOLDER,
+        Config.GZ_TOP_FOLDER,
+        Config.DAT_TOP_FOLDER,
+        Config.TAR_TOP_FOLDER,
+    ]
+    for path in folder_list:
+        Path(path).mkdir(exist_ok=True)
+
+
 if __name__ == "__main__":
-    os.makedirs(Path(Config.ASC_TOP_FOLDER), exist_ok=True)
-    os.makedirs(Path(Config.COMBINED_FOLDER), exist_ok=True)
+    initialise_folders()
 
     locations = []
     zones = set()
@@ -44,6 +57,7 @@ if __name__ == "__main__":
     logging.info(f"Count of 1km Grids: {len(locations)}")
     logging.info(f"Count of Zones: {len(zones)}")
 
+    extraction = Extract(Config)
     batch = BatchNimrod(Config)
     timeseries = GenerateTimeseries(Config, locations)
 
@@ -55,6 +69,9 @@ if __name__ == "__main__":
     # Initialize results structure
     results = {loc[0]: {"dates": [], "values": []} for loc in locations}
 
+    logging.info("Extracting tar and gz files")
+    extraction.run_extraction()
+
     # Get list of DAT files
     dat_files = [
         f for f in os.listdir(Path(Config.DAT_TOP_FOLDER)) if not f.startswith(".")
diff --git a/modules/__init__.py b/modules/__init__.py
index 6745012..7fa12b2 100644
--- a/modules/__init__.py
+++ b/modules/__init__.py
@@ -1,9 +1,6 @@
 from .nimrod import Nimrod
 from .batch_nimrod import BatchNimrod
 from .generate_timeseries import GenerateTimeseries
+from .extract import Extract
 
-__all__ = [
-    "Nimrod",
-    "BatchNimrod",
-    "GenerateTimeseries",
-]
+__all__ = ["Nimrod", "BatchNimrod", "GenerateTimeseries", "Extract"]
diff --git a/modules/extract.py b/modules/extract.py
new file mode 100755
index 0000000..249b520
--- /dev/null
+++ b/modules/extract.py
@@ -0,0 +1,62 @@
+import tarfile
+import gzip
+import shutil
+import os
+from pathlib import Path
+
+
+class Extract:
+    # Directory containing .tar files
+    def __init__(self, Config):
+        self.config = Config
+
+    def _extract_tar(self):
+        for tar_file in os.listdir(self.config.TAR_TOP_FOLDER):
+            # only handle .tar files
+            if not tar_file.endswith(".tar"):
+                pass
+
+            tar_path = Path(self.config.TAR_TOP_FOLDER, tar_file)
+
+            # Create a folder for extracted tar contents
+            extract_folder = Path(
+                self.config.GZ_TOP_FOLDER, tar_file.replace(".tar", "")
+            )
+            Path(extract_folder).mkdir(exist_ok=True)
+
+            # Extract .tar file
+            with tarfile.open(tar_path, "r") as tar:
+                tar.extractall(path=extract_folder)
+
+            if self.config.delete_tar_after_processing:
+                os.remove(tar_path)
+
+    def _extract_gz(self):
+        for root, _, files in os.walk(self.config.GZ_TOP_FOLDER):
+            for file in files:
+                # only handle .gz files
+                if not file.endswith(".dat.gz"):
+                    pass  # adjust if extension differs
+                gz_path = Path(root, file)
+                dat_path = Path(self.config.DAT_TOP_FOLDER, file.replace(".gz", ""))
+
+                # Unzip .gz file
+                with gzip.open(gz_path, "rb") as f_in:
+                    with open(dat_path, "wb") as f_out:
+                        shutil.copyfileobj(f_in, f_out)
+
+                if self.config.delete_gz_after_processing:
+                    os.remove(gz_path)
+
+        try:
+            shutil.rmtree(self.config.GZ_TOP_FOLDER)
+            print("processing complete and GZ files deleted")
+        except Exception as e:
+            print(str(e))
+            print(
+                f"processing complete but GZ folder delete failed. Please delete manually ({self.config.GZ_TOP_FOLDER})"
+            )
+
+    def run_extraction(self):
+        self._extract_tar()
+        self._extract_gz()
diff --git a/pyproject.toml b/pyproject.toml
index 1f10b49..c8b4255 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "met-office"
-version = "1.1.1"
+version = "1.2.0"
 description = "Convert .dat nimrod files to .asc files"
 readme = "README.md"
 requires-python = ">=3.14"
diff --git a/uv.lock b/uv.lock
index 127efef..e63ab11 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4,7 +4,7 @@ requires-python = ">=3.14"
 
 [[package]]
 name = "met-office"
-version = "1.1.1"
+version = "1.2.0"
 source = { virtual = "." }
 dependencies = [
     { name = "numpy" },