Merge pull request #2 from Jake-Pullen/extraction
feat: ✨ Extraction now part of the main workflow
This commit is contained in:
@@ -10,6 +10,8 @@ wheels/
|
|||||||
.venv
|
.venv
|
||||||
|
|
||||||
dat_other/*
|
dat_other/*
|
||||||
|
tar_files/*
|
||||||
|
gz_files/*
|
||||||
dat_files/*
|
dat_files/*
|
||||||
asc_files/*
|
asc_files/*
|
||||||
csv_files/*
|
csv_files/*
|
||||||
|
|||||||
@@ -9,16 +9,23 @@ The project consists of a main pipeline workflow that processes multiple modules
|
|||||||
- `main.py`: Main pipeline orchestrator that calls on the modules as needed
|
- `main.py`: Main pipeline orchestrator that calls on the modules as needed
|
||||||
- `batch_nimrod.py`: Module for batch processing multiple NIMROD files with configurable bounding boxes
|
- `batch_nimrod.py`: Module for batch processing multiple NIMROD files with configurable bounding boxes
|
||||||
- `generate_timeseries.py`: Module for extracting cropped rain data and creating rainfall timeseries
|
- `generate_timeseries.py`: Module for extracting cropped rain data and creating rainfall timeseries
|
||||||
|
- `extract.py`: Module for extracting the dat files from the .gz.tar files that are downloaded from source
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
### main.py
|
### main.py
|
||||||
|
|
||||||
- Orchestrates the entire workflow pipeline
|
- Orchestrates the entire workflow pipeline
|
||||||
|
- Uncompress the packed .gz.tar files to DAT files
|
||||||
- Processes DAT files to ASC format
|
- Processes DAT files to ASC format
|
||||||
- Generates timeseries data for specified locations
|
- Generates timeseries data for specified locations
|
||||||
- Combines grouped CSV files into consolidated datasets formatted for Infoworks ICM
|
- Combines grouped CSV files into consolidated datasets formatted for Infoworks ICM
|
||||||
|
|
||||||
|
### extract.py
|
||||||
|
|
||||||
|
- Converts all .gz.tar files first to 288 (1 day) of .gz files
|
||||||
|
- Converts all .gz files to .dat files ready for processing.
|
||||||
|
|
||||||
### batch_nimrod.py
|
### batch_nimrod.py
|
||||||
|
|
||||||
- Process multiple NIMROD dat files
|
- Process multiple NIMROD dat files
|
||||||
@@ -44,24 +51,28 @@ It is recommended to use UV for environment and package handling.
|
|||||||
|
|
||||||
1. Ensure all required packages are installed `uv sync`
|
1. Ensure all required packages are installed `uv sync`
|
||||||
1. Adjust the config.py file to match your needs.
|
1. Adjust the config.py file to match your needs.
|
||||||
1. Ensure your .dat files are in the DAT_TOP_FOLDER (as per config location)
|
1. Ensure your .gz.tar files are in the TAR_TOP_FOLDER (as per config location)
|
||||||
1. Ensure your zone csv files are in the ZONE_FOLDER (as per config location)
|
1. Ensure your zone csv files are in the ZONE_FOLDER (as per config location)
|
||||||
1. RunMain Pipeline `uv run main.py` Note that you will have to set your environment variable `PYTHON_GIL=0` first
|
1. RunMain Pipeline `uv run main.py` Note that you will have to set your environment variable `PYTHON_GIL=0` first
|
||||||
1. find the output in the COMBINED_FOLDER (as per config location)
|
1. find the output in the COMBINED_FOLDER (as per config location)
|
||||||
|
|
||||||
The main pipeline will:
|
The main pipeline will:
|
||||||
|
|
||||||
1. Process DAT files to ASC format if needed
|
1. Uncompress the .gz.tar files ready for processing
|
||||||
|
1. Process DAT files to ASC format
|
||||||
1. Generate timeseries data for specified locations
|
1. Generate timeseries data for specified locations
|
||||||
1. Combine grouped CSV files into consolidated datasets
|
1. Combine grouped locations into consolidated datasets
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
The `config.py` file defines folder paths:
|
The `config.py` file defines folder paths and file deletion options:
|
||||||
|
|
||||||
- DAT_TOP_FOLDER: "./dat_files"
|
- TAR_TOP_FOLDER = "./tar_files"
|
||||||
- ASC_TOP_FOLDER: "./asc_files"
|
- GZ_TOP_FOLDER = "./gz_files"
|
||||||
- COMBINED_FOLDER: "./combined_files"
|
- DAT_TOP_FOLDER = "./dat_files"
|
||||||
|
- ASC_TOP_FOLDER = "./asc_files"
|
||||||
|
- COMBINED_FOLDER = "./combined_files"
|
||||||
|
- ZONE_FOLDER = "./zone_inputs"
|
||||||
|
|
||||||
Example of how the zone csv files should look:
|
Example of how the zone csv files should look:
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
class Config:
|
class Config:
|
||||||
|
TAR_TOP_FOLDER = "./tar_files"
|
||||||
|
GZ_TOP_FOLDER = "./gz_files"
|
||||||
DAT_TOP_FOLDER = "./dat_files"
|
DAT_TOP_FOLDER = "./dat_files"
|
||||||
ASC_TOP_FOLDER = "./asc_files"
|
ASC_TOP_FOLDER = "./asc_files"
|
||||||
COMBINED_FOLDER = "./combined_files"
|
COMBINED_FOLDER = "./combined_files"
|
||||||
|
|
||||||
ZONE_FOLDER = "./zone_inputs"
|
ZONE_FOLDER = "./zone_inputs"
|
||||||
|
|
||||||
delete_dat_after_processing = False
|
delete_tar_after_processing = False
|
||||||
|
delete_gz_after_processing = True
|
||||||
|
delete_dat_after_processing = True
|
||||||
delete_asc_after_processing = True
|
delete_asc_after_processing = True
|
||||||
|
|||||||
@@ -6,12 +6,13 @@ import concurrent.futures
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from config import Config
|
from config import Config
|
||||||
from modules import BatchNimrod, GenerateTimeseries
|
from modules import BatchNimrod, GenerateTimeseries, Extract
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def process_pipeline(dat_file):
|
def process_pipeline(dat_file):
|
||||||
# 1. Process DAT to ASC
|
# 1. Process DAT to ASC
|
||||||
asc_file = batch._process_single_file(dat_file)
|
asc_file = batch._process_single_file(dat_file)
|
||||||
@@ -22,9 +23,21 @@ def process_pipeline(dat_file):
|
|||||||
file_results = timeseries.process_asc_file(asc_file, locations)
|
file_results = timeseries.process_asc_file(asc_file, locations)
|
||||||
return file_results
|
return file_results
|
||||||
|
|
||||||
|
|
||||||
|
def initialise_folders():
|
||||||
|
folder_list = [
|
||||||
|
Config.ASC_TOP_FOLDER,
|
||||||
|
Config.COMBINED_FOLDER,
|
||||||
|
Config.GZ_TOP_FOLDER,
|
||||||
|
Config.DAT_TOP_FOLDER,
|
||||||
|
Config.TAR_TOP_FOLDER,
|
||||||
|
]
|
||||||
|
for path in folder_list:
|
||||||
|
Path(path).mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
os.makedirs(Path(Config.ASC_TOP_FOLDER), exist_ok=True)
|
initialise_folders()
|
||||||
os.makedirs(Path(Config.COMBINED_FOLDER), exist_ok=True)
|
|
||||||
|
|
||||||
locations = []
|
locations = []
|
||||||
zones = set()
|
zones = set()
|
||||||
@@ -44,6 +57,7 @@ if __name__ == "__main__":
|
|||||||
logging.info(f"Count of 1km Grids: {len(locations)}")
|
logging.info(f"Count of 1km Grids: {len(locations)}")
|
||||||
logging.info(f"Count of Zones: {len(zones)}")
|
logging.info(f"Count of Zones: {len(zones)}")
|
||||||
|
|
||||||
|
extraction = Extract(Config)
|
||||||
batch = BatchNimrod(Config)
|
batch = BatchNimrod(Config)
|
||||||
timeseries = GenerateTimeseries(Config, locations)
|
timeseries = GenerateTimeseries(Config, locations)
|
||||||
|
|
||||||
@@ -55,6 +69,9 @@ if __name__ == "__main__":
|
|||||||
# Initialize results structure
|
# Initialize results structure
|
||||||
results = {loc[0]: {"dates": [], "values": []} for loc in locations}
|
results = {loc[0]: {"dates": [], "values": []} for loc in locations}
|
||||||
|
|
||||||
|
logging.info("Extracting tar and gz files")
|
||||||
|
extraction.run_extraction()
|
||||||
|
|
||||||
# Get list of DAT files
|
# Get list of DAT files
|
||||||
dat_files = [
|
dat_files = [
|
||||||
f for f in os.listdir(Path(Config.DAT_TOP_FOLDER)) if not f.startswith(".")
|
f for f in os.listdir(Path(Config.DAT_TOP_FOLDER)) if not f.startswith(".")
|
||||||
|
|||||||
+2
-5
@@ -1,9 +1,6 @@
|
|||||||
from .nimrod import Nimrod
|
from .nimrod import Nimrod
|
||||||
from .batch_nimrod import BatchNimrod
|
from .batch_nimrod import BatchNimrod
|
||||||
from .generate_timeseries import GenerateTimeseries
|
from .generate_timeseries import GenerateTimeseries
|
||||||
|
from .extract import Extract
|
||||||
|
|
||||||
__all__ = [
|
__all__ = ["Nimrod", "BatchNimrod", "GenerateTimeseries", "Extract"]
|
||||||
"Nimrod",
|
|
||||||
"BatchNimrod",
|
|
||||||
"GenerateTimeseries",
|
|
||||||
]
|
|
||||||
|
|||||||
Executable
+62
@@ -0,0 +1,62 @@
|
|||||||
|
import tarfile
|
||||||
|
import gzip
|
||||||
|
import shutil
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
class Extract:
|
||||||
|
# Directory containing .tar files
|
||||||
|
def __init__(self, Config):
|
||||||
|
self.config = Config
|
||||||
|
|
||||||
|
def _extract_tar(self):
|
||||||
|
for tar_file in os.listdir(self.config.TAR_TOP_FOLDER):
|
||||||
|
# only handle .tar files
|
||||||
|
if not tar_file.endswith(".tar"):
|
||||||
|
pass
|
||||||
|
|
||||||
|
tar_path = Path(self.config.TAR_TOP_FOLDER, tar_file)
|
||||||
|
|
||||||
|
# Create a folder for extracted tar contents
|
||||||
|
extract_folder = Path(
|
||||||
|
self.config.GZ_TOP_FOLDER, tar_file.replace(".tar", "")
|
||||||
|
)
|
||||||
|
Path(extract_folder).mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
# Extract .tar file
|
||||||
|
with tarfile.open(tar_path, "r") as tar:
|
||||||
|
tar.extractall(path=extract_folder)
|
||||||
|
|
||||||
|
if self.config.delete_tar_after_processing:
|
||||||
|
os.remove(tar_path)
|
||||||
|
|
||||||
|
def _extract_gz(self):
|
||||||
|
for root, _, files in os.walk(self.config.GZ_TOP_FOLDER):
|
||||||
|
for file in files:
|
||||||
|
# only handle .gz files
|
||||||
|
if not file.endswith(".dat.gz"):
|
||||||
|
pass # adjust if extension differs
|
||||||
|
gz_path = Path(root, file)
|
||||||
|
dat_path = Path(self.config.DAT_TOP_FOLDER, file.replace(".gz", ""))
|
||||||
|
|
||||||
|
# Unzip .gz file
|
||||||
|
with gzip.open(gz_path, "rb") as f_in:
|
||||||
|
with open(dat_path, "wb") as f_out:
|
||||||
|
shutil.copyfileobj(f_in, f_out)
|
||||||
|
|
||||||
|
if self.config.delete_gz_after_processing:
|
||||||
|
os.remove(gz_path)
|
||||||
|
|
||||||
|
try:
|
||||||
|
shutil.rmtree(self.config.GZ_TOP_FOLDER)
|
||||||
|
print("processing complete and GZ files deleted")
|
||||||
|
except Exception as e:
|
||||||
|
print(str(e))
|
||||||
|
print(
|
||||||
|
f"processing complete but GZ folder delete failed. Please delete manually ({self.config.GZ_TOP_FOLDER})"
|
||||||
|
)
|
||||||
|
|
||||||
|
def run_extraction(self):
|
||||||
|
self._extract_tar()
|
||||||
|
self._extract_gz()
|
||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "met-office"
|
name = "met-office"
|
||||||
version = "1.1.1"
|
version = "1.2.0"
|
||||||
description = "Convert .dat nimrod files to .asc files"
|
description = "Convert .dat nimrod files to .asc files"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.14"
|
requires-python = ">=3.14"
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ requires-python = ">=3.14"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "met-office"
|
name = "met-office"
|
||||||
version = "1.1.1"
|
version = "1.2.0"
|
||||||
source = { virtual = "." }
|
source = { virtual = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "numpy" },
|
{ name = "numpy" },
|
||||||
|
|||||||
Reference in New Issue
Block a user