From d386317957ea86e9698301c434c42f239c863a9b Mon Sep 17 00:00:00 2001 From: Jake Pullen Date: Thu, 11 Dec 2025 08:47:29 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20=E2=9C=A8=20Extraction=20now=20part=20o?= =?UTF-8?q?f=20the=20main=20workflow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 ++ README.MD | 25 +++++++++++++----- config.py | 7 ++++- main.py | 23 ++++++++++++++--- modules/__init__.py | 7 ++--- modules/extract.py | 62 +++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- uv.lock | 2 +- 8 files changed, 112 insertions(+), 18 deletions(-) create mode 100755 modules/extract.py diff --git a/.gitignore b/.gitignore index 07c8c0f..5e03441 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ wheels/ .venv dat_other/* +tar_files/* +gz_files/* dat_files/* asc_files/* csv_files/* diff --git a/README.MD b/README.MD index ba377a6..50fde5d 100644 --- a/README.MD +++ b/README.MD @@ -9,16 +9,23 @@ The project consists of a main pipeline workflow that processes multiple modules - `main.py`: Main pipeline orchestrator that calls on the modules as needed - `batch_nimrod.py`: Module for batch processing multiple NIMROD files with configurable bounding boxes - `generate_timeseries.py`: Module for extracting cropped rain data and creating rainfall timeseries +- `extract.py`: Module for extracting the dat files from the .gz.tar files that are downloaded from source ## Features ### main.py - Orchestrates the entire workflow pipeline +- Uncompress the packed .gz.tar files to DAT files - Processes DAT files to ASC format - Generates timeseries data for specified locations - Combines grouped CSV files into consolidated datasets formatted for Infoworks ICM +### extract.py + +- Converts all .gz.tar files first to 288 (1 day) of .gz files +- Converts all .gz files to .dat files ready for processing. 
+ ### batch_nimrod.py - Process multiple NIMROD dat files @@ -44,24 +51,28 @@ It is recommended to use UV for environment and package handling. 1. Ensure all required packages are installed `uv sync` 1. Adjust the config.py file to match your needs. -1. Ensure your .dat files are in the DAT_TOP_FOLDER (as per config location) +1. Ensure your .gz.tar files are in the TAR_TOP_FOLDER (as per config location) 1. Ensure your zone csv files are in the ZONE_FOLDER (as per config location) 1. RunMain Pipeline `uv run main.py` Note that you will have to set your environment variable `PYTHON_GIL=0` first 1. find the output in the COMBINED_FOLDER (as per config location) The main pipeline will: -1. Process DAT files to ASC format if needed +1. Uncompress the .gz.tar files ready for processing +1. Process DAT files to ASC format 1. Generate timeseries data for specified locations -1. Combine grouped CSV files into consolidated datasets +1. Combine grouped locations into consolidated datasets ## Configuration -The `config.py` file defines folder paths: +The `config.py` file defines folder paths and file deletion options: -- DAT_TOP_FOLDER: "./dat_files" -- ASC_TOP_FOLDER: "./asc_files" -- COMBINED_FOLDER: "./combined_files" +- TAR_TOP_FOLDER = "./tar_files" +- GZ_TOP_FOLDER = "./gz_files" +- DAT_TOP_FOLDER = "./dat_files" +- ASC_TOP_FOLDER = "./asc_files" +- COMBINED_FOLDER = "./combined_files" +- ZONE_FOLDER = "./zone_inputs" Example of how the zone csv files should look: diff --git a/config.py b/config.py index 22bbd56..e5b008b 100644 --- a/config.py +++ b/config.py @@ -1,8 +1,13 @@ class Config: + TAR_TOP_FOLDER = "./tar_files" + GZ_TOP_FOLDER = "./gz_files" DAT_TOP_FOLDER = "./dat_files" ASC_TOP_FOLDER = "./asc_files" COMBINED_FOLDER = "./combined_files" + ZONE_FOLDER = "./zone_inputs" - delete_dat_after_processing = False + delete_tar_after_processing = False + delete_gz_after_processing = True + delete_dat_after_processing = True delete_asc_after_processing = True diff 
--git a/main.py b/main.py index b038ccc..fa6fa40 100644 --- a/main.py +++ b/main.py @@ -6,12 +6,13 @@ import concurrent.futures from pathlib import Path from config import Config -from modules import BatchNimrod, GenerateTimeseries +from modules import BatchNimrod, GenerateTimeseries, Extract logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) + def process_pipeline(dat_file): # 1. Process DAT to ASC asc_file = batch._process_single_file(dat_file) @@ -22,9 +23,21 @@ def process_pipeline(dat_file): file_results = timeseries.process_asc_file(asc_file, locations) return file_results + +def initialise_folders(): + folder_list = [ + Config.ASC_TOP_FOLDER, + Config.COMBINED_FOLDER, + Config.GZ_TOP_FOLDER, + Config.DAT_TOP_FOLDER, + Config.TAR_TOP_FOLDER, + ] + for path in folder_list: + Path(path).mkdir(exist_ok=True) + + if __name__ == "__main__": - os.makedirs(Path(Config.ASC_TOP_FOLDER), exist_ok=True) - os.makedirs(Path(Config.COMBINED_FOLDER), exist_ok=True) + initialise_folders() locations = [] zones = set() @@ -44,6 +57,7 @@ if __name__ == "__main__": logging.info(f"Count of 1km Grids: {len(locations)}") logging.info(f"Count of Zones: {len(zones)}") + extraction = Extract(Config) batch = BatchNimrod(Config) timeseries = GenerateTimeseries(Config, locations) @@ -55,6 +69,9 @@ if __name__ == "__main__": # Initialize results structure results = {loc[0]: {"dates": [], "values": []} for loc in locations} + logging.info("Extracting tar and gz files") + extraction.run_extraction() + # Get list of DAT files dat_files = [ f for f in os.listdir(Path(Config.DAT_TOP_FOLDER)) if not f.startswith(".") diff --git a/modules/__init__.py b/modules/__init__.py index 6745012..7fa12b2 100644 --- a/modules/__init__.py +++ b/modules/__init__.py @@ -1,9 +1,6 @@ from .nimrod import Nimrod from .batch_nimrod import BatchNimrod from .generate_timeseries import GenerateTimeseries +from .extract import Extract -__all__ = [ - "Nimrod", - "BatchNimrod", 
-    "GenerateTimeseries",
-]
+__all__ = ["Nimrod", "BatchNimrod", "GenerateTimeseries", "Extract"]
diff --git a/modules/extract.py b/modules/extract.py
new file mode 100755
index 0000000..249b520
--- /dev/null
+++ b/modules/extract.py
@@ -0,0 +1,89 @@
+"""Unpack downloaded .tar archives into .dat files via their .gz members."""
+
+import gzip
+import logging
+import os
+import shutil
+import tarfile
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class Extract:
+    """Unpack .tar archives to .gz members, then gunzip those to .dat files.
+
+    Folder locations and the delete-after-processing flags are read from the
+    Config object passed to the constructor.
+    """
+
+    def __init__(self, Config):
+        self.config = Config
+
+    def _extract_tar(self):
+        """Unpack each .tar in TAR_TOP_FOLDER into a GZ_TOP_FOLDER subfolder."""
+        for tar_file in os.listdir(self.config.TAR_TOP_FOLDER):
+            # Only handle .tar files; skip everything else.
+            if not tar_file.endswith(".tar"):
+                continue
+
+            tar_path = Path(self.config.TAR_TOP_FOLDER, tar_file)
+
+            # One subfolder per archive, named after it minus the ".tar" suffix.
+            extract_folder = Path(
+                self.config.GZ_TOP_FOLDER, tar_file.removesuffix(".tar")
+            )
+            # parents=True because _extract_gz may have removed GZ_TOP_FOLDER.
+            extract_folder.mkdir(parents=True, exist_ok=True)
+
+            with tarfile.open(tar_path, "r") as tar:
+                # The "data" filter rejects members that would be written
+                # outside extract_folder, as well as unsafe links and devices.
+                tar.extractall(path=extract_folder, filter="data")
+
+            if self.config.delete_tar_after_processing:
+                os.remove(tar_path)
+
+    def _extract_gz(self):
+        """Gunzip every .dat.gz under GZ_TOP_FOLDER into DAT_TOP_FOLDER.
+
+        When delete_gz_after_processing is set, each .gz is removed once
+        unpacked and GZ_TOP_FOLDER itself is removed afterwards; otherwise
+        the extracted .gz files are left in place.
+        """
+        for root, _, files in os.walk(self.config.GZ_TOP_FOLDER):
+            for file in files:
+                # Skip anything that is not a gzipped dat file.
+                if not file.endswith(".dat.gz"):
+                    continue  # adjust if extension differs
+
+                gz_path = Path(root, file)
+                dat_path = Path(self.config.DAT_TOP_FOLDER, file.removesuffix(".gz"))
+
+                with gzip.open(gz_path, "rb") as f_in, open(dat_path, "wb") as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+                if self.config.delete_gz_after_processing:
+                    os.remove(gz_path)
+
+        # Honour the config flag: only remove the folder when deletion is enabled.
+        if not self.config.delete_gz_after_processing:
+            logger.info("processing complete and GZ files kept")
+            return
+
+        try:
+            shutil.rmtree(self.config.GZ_TOP_FOLDER)
+        except OSError as e:
+            logger.warning("%s", e)
+            logger.warning(
+                "processing complete but GZ folder delete failed. "
+                "Please delete manually (%s)",
+                self.config.GZ_TOP_FOLDER,
+            )
+        else:
+            logger.info("processing complete and GZ files deleted")
+
+    def run_extraction(self):
+        """Run both stages in order: .tar -> .gz, then .gz -> .dat."""
+        self._extract_tar()
+        self._extract_gz()
diff --git a/pyproject.toml b/pyproject.toml
index 1f10b49..c8b4255 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "met-office"
-version = "1.1.1"
+version = "1.2.0"
 description = "Convert .dat nimrod files to .asc files"
 readme = "README.md"
 requires-python = ">=3.14"
diff --git a/uv.lock b/uv.lock
index 127efef..e63ab11 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4,7 +4,7 @@ requires-python = ">=3.14"
 
 [[package]]
 name = "met-office"
-version = "1.1.1"
+version = "1.2.0"
 source = { virtual = "." }
 dependencies = [
     { name = "numpy" },