Extraction streamlining (#3)
* feat: ✨ Added the extraction process to the main multi-threaded loop; also added a warning when the app finds existing CSV files in the combined folder
* fix: 🐛 Fixed time calculations for ETA & completion
+28 −18
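The ETA and completion-time fix from the second message line is not visible in the hunks below. As context only, a minimal sketch of the usual elapsed-time-based calculation (all names here are hypothetical, not taken from this repository):

    import time

    def eta_seconds(start_time, items_done, items_total):
        # Estimate remaining time from the average rate observed so far.
        elapsed = time.monotonic() - start_time
        if items_done == 0:
            return float("inf")  # nothing processed yet; no estimate possible
        per_item = elapsed / items_done
        return per_item * (items_total - items_done)

One common cause of skewed ETAs is deriving the rate from a single item's timing rather than total elapsed wall time; the sketch above uses the cumulative average, and a completion timestamp follows directly as now + ETA.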
@@ -3,6 +3,7 @@ import gzip
 import shutil
 import os
 from pathlib import Path
+import concurrent.futures
 
 
 class Extract:
@@ -10,12 +11,8 @@ class Extract:
     def __init__(self, Config):
         self.config = Config
 
-    def _extract_tar(self):
-        for tar_file in os.listdir(self.config.TAR_TOP_FOLDER):
-            # only handle .tar files
-            if not tar_file.endswith(".tar"):
-                pass
-
+    def extract_tar_batch(self, tar_files):
+        for tar_file in tar_files:
             tar_path = Path(self.config.TAR_TOP_FOLDER, tar_file)
 
             # Create a folder for extracted tar contents
@@ -31,32 +28,45 @@ class Extract:
             if self.config.delete_tar_after_processing:
                 os.remove(tar_path)
 
-    def _extract_gz(self):
+    def process_single_gz(self, gz_path, dat_path):
+        try:
+            with gzip.open(gz_path, "rb") as f_in:
+                with open(dat_path, "wb") as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+            if self.config.delete_gz_after_processing:
+                os.remove(gz_path)
+        except Exception as e:
+            print(f"Error extracting {gz_path}: {e}")
+
+    def extract_gz_batch(self):
+        gz_tasks = []
         for root, _, files in os.walk(self.config.GZ_TOP_FOLDER):
             for file in files:
                 # only handle .gz files
                 if not file.endswith(".dat.gz"):
-                    pass  # adjust if extension differs
+                    continue
 
                 gz_path = Path(root, file)
                 dat_path = Path(self.config.DAT_TOP_FOLDER, file.replace(".gz", ""))
+                gz_tasks.append((gz_path, dat_path))
 
-                # Unzip .gz file
-                with gzip.open(gz_path, "rb") as f_in:
-                    with open(dat_path, "wb") as f_out:
-                        shutil.copyfileobj(f_in, f_out)
+        print(f"Extracting {len(gz_tasks)} gz files concurrently...")
 
-                if self.config.delete_gz_after_processing:
-                    os.remove(gz_path)
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.process_single_gz, gz_path, dat_path)
+                for gz_path, dat_path in gz_tasks
+            ]
+            concurrent.futures.wait(futures)
 
         try:
             shutil.rmtree(self.config.GZ_TOP_FOLDER)
+            # Recreate the folder for the next batch
+            Path(self.config.GZ_TOP_FOLDER).mkdir(exist_ok=True)
             print("processing complete and GZ files deleted")
         except Exception as e:
             print(str(e))
             print(
                 f"processing complete but GZ folder delete failed. Please delete manually ({self.config.GZ_TOP_FOLDER})"
             )
-
-    def run_extraction(self):
-        self._extract_tar()
-        self._extract_gz()
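With run_extraction removed, these batch methods are presumably driven from the app's main multi-threaded loop, per the commit message. A hedged sketch of what such a caller could look like (the batching helper, batch size, and config variable are assumptions, not code from this repository):

    import os

    def chunked(items, size):
        # Yield successive fixed-size slices of a list.
        for i in range(0, len(items), size):
            yield items[i:i + size]

    extractor = Extract(config)  # config: the Config object passed to __init__ above
    tar_files = [f for f in os.listdir(config.TAR_TOP_FOLDER) if f.endswith(".tar")]
    for batch in chunked(tar_files, 10):    # batch size of 10 is an assumption
        extractor.extract_tar_batch(batch)  # unpack this batch of .tar archives
        extractor.extract_gz_batch()        # then decompress the .gz members concurrently

ThreadPoolExecutor is a sensible fit for extract_gz_batch: both file I/O and zlib decompression release the GIL in CPython, so threads can genuinely overlap this work without multiprocessing overhead.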
@@ -164,8 +164,8 @@ class GenerateTimeseries:
                 executor.shutdown(wait=False, cancel_futures=True)
                 raise
 
-    def write_results_to_csv(self, results, locations):
-        """Write extracted data to CSV files for each zone.
+    def append_results_to_csv(self, results, locations):
+        """Append extracted data to CSV files for each zone.
 
         Args:
             results (dict): Aggregated results {zone_id: {'dates': [], 'values': []}}
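For reference, a hypothetical call shaped after the docstring above (the structure of locations is not shown in this diff, so it is left empty here):

    results = {
        "zone_1": {"dates": ["2024-01-01 00:00:00"], "values": [1.2345]},
    }
    locations = {}  # real structure not visible in this diff
    generator.append_results_to_csv(results, locations)  # generator: a GenerateTimeseries instance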
@@ -217,19 +217,41 @@ class GenerateTimeseries:
 
                 df_dict[grid_square] = aligned_values
 
-            df = pd.DataFrame(df_dict)
+            new_df = pd.DataFrame(df_dict)
 
-            # Sort by datetime (already sorted)
-            sorted_df = df.sort("datetime")
-
-            # Format datetime column
-            sorted_df = sorted_df.with_columns(
+            # Format datetime column in new_df
+            new_df = new_df.with_columns(
                 pd.col("datetime").dt.strftime("%Y-%m-%d %H:%M:%S")
             )
 
             output_path = (
                 Path(self.config.COMBINED_FOLDER) / f"{zone_name}_timeseries_data.csv"
             )
-            sorted_df.write_csv(output_path, float_precision=4)
 
-        logging.info("All CSV files written.")
+            if output_path.exists():
+                # Load existing CSV
+                existing_df = pd.read_csv(output_path)
+
+                # Reorder new_df to match existing_df
+                new_df = new_df.select(existing_df.columns)
+
+                # Concatenate
+                combined_df = pd.concat([existing_df, new_df])
+                # Sort by datetime
+                combined_df = combined_df.sort("datetime")
+                # Write back
+                combined_df.write_csv(output_path, float_precision=4)
+            else:
+                # Write new CSV
+                # Sort columns to ensure deterministic order (datetime first)
+                cols = new_df.columns
+                cols.remove("datetime")
+                cols.sort()
+                sorted_cols = ["datetime"] + cols
+                new_df = new_df.select(sorted_cols)
+
+                # Sort by datetime (already sorted but good practice)
+                sorted_df = new_df.sort("datetime")
+                sorted_df.write_csv(output_path, float_precision=4)
+
+        logging.info("All CSV files updated.")
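The warning about pre-existing CSV files mentioned in the commit message does not appear in the hunks above either; it presumably lives elsewhere in the changeset. A minimal sketch of such a check (function name and wording are hypothetical):

    from pathlib import Path
    import logging

    def warn_if_existing_csvs(combined_folder):
        # Hypothetical: tell the user that new results will be appended, not overwritten.
        existing = list(Path(combined_folder).glob("*.csv"))
        if existing:
            logging.warning(
                "Found %d existing CSV file(s) in %s; new results will be appended.",
                len(existing),
                combined_folder,
            )

This pairs naturally with the new append path in append_results_to_csv, which concatenates onto any CSV it finds rather than overwriting it.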