Extraction streamlining (#3)
* feat: ✨ Added the extraction process to the main multi-threaded loop; also added a warning when the app finds existing CSV files in the combined folder
* fix: 🐛 Fixed time calculations for ETA & completion
+28 −18
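The ETA and completion-time fix from the second message line is not visible in the hunks below. As context only, a minimal sketch of the usual elapsed-time-based calculation (all names here are hypothetical, not taken from this repository):

    import time

    def eta_seconds(start_time, items_done, items_total):
        # Estimate remaining time from the average rate observed so far.
        elapsed = time.monotonic() - start_time
        if items_done == 0:
            return float("inf")  # nothing processed yet; no estimate possible
        per_item = elapsed / items_done
        return per_item * (items_total - items_done)

One common cause of skewed ETAs is deriving the rate from a single item's timing rather than total elapsed wall time; the sketch above uses the cumulative average, and a completion timestamp follows directly as now + ETA.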
@@ -3,6 +3,7 @@ import gzip
 import shutil
 import os
 from pathlib import Path
+import concurrent.futures
 
 
 class Extract:
@@ -10,12 +11,8 @@ class Extract:
     def __init__(self, Config):
         self.config = Config
 
-    def _extract_tar(self):
-        for tar_file in os.listdir(self.config.TAR_TOP_FOLDER):
-            # only handle .tar files
-            if not tar_file.endswith(".tar"):
-                pass
-
+    def extract_tar_batch(self, tar_files):
+        for tar_file in tar_files:
             tar_path = Path(self.config.TAR_TOP_FOLDER, tar_file)
 
             # Create a folder for extracted tar contents
@@ -31,32 +28,45 @@ class Extract:
             if self.config.delete_tar_after_processing:
                 os.remove(tar_path)
 
-    def _extract_gz(self):
+    def process_single_gz(self, gz_path, dat_path):
+        try:
+            with gzip.open(gz_path, "rb") as f_in:
+                with open(dat_path, "wb") as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+
+            if self.config.delete_gz_after_processing:
+                os.remove(gz_path)
+        except Exception as e:
+            print(f"Error extracting {gz_path}: {e}")
+
+    def extract_gz_batch(self):
+        gz_tasks = []
         for root, _, files in os.walk(self.config.GZ_TOP_FOLDER):
             for file in files:
                 # only handle .gz files
                 if not file.endswith(".dat.gz"):
-                    pass  # adjust if extension differs
+                    continue
 
                 gz_path = Path(root, file)
                 dat_path = Path(self.config.DAT_TOP_FOLDER, file.replace(".gz", ""))
+                gz_tasks.append((gz_path, dat_path))
 
-                # Unzip .gz file
-                with gzip.open(gz_path, "rb") as f_in:
-                    with open(dat_path, "wb") as f_out:
-                        shutil.copyfileobj(f_in, f_out)
+        print(f"Extracting {len(gz_tasks)} gz files concurrently...")
 
-                if self.config.delete_gz_after_processing:
-                    os.remove(gz_path)
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.process_single_gz, gz_path, dat_path)
+                for gz_path, dat_path in gz_tasks
+            ]
+            concurrent.futures.wait(futures)
 
         try:
             shutil.rmtree(self.config.GZ_TOP_FOLDER)
+            # Recreate the folder for the next batch
+            Path(self.config.GZ_TOP_FOLDER).mkdir(exist_ok=True)
             print("processing complete and GZ files deleted")
         except Exception as e:
             print(str(e))
             print(
                 f"processing complete but GZ folder delete failed. Please delete manually ({self.config.GZ_TOP_FOLDER})"
             )
-
-    def run_extraction(self):
-        self._extract_tar()
-        self._extract_gz()
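With run_extraction removed, these batch methods are presumably driven from the app's main multi-threaded loop, per the commit message. A hedged sketch of what such a caller could look like (the batching helper, batch size, and config variable are assumptions, not code from this repository):

    import os

    def chunked(items, size):
        # Yield successive fixed-size slices of a list.
        for i in range(0, len(items), size):
            yield items[i:i + size]

    extractor = Extract(config)  # config: the Config object passed to __init__ above
    tar_files = [f for f in os.listdir(config.TAR_TOP_FOLDER) if f.endswith(".tar")]
    for batch in chunked(tar_files, 10):    # batch size of 10 is an assumption
        extractor.extract_tar_batch(batch)  # unpack this batch of .tar archives
        extractor.extract_gz_batch()        # then decompress the .gz members concurrently

ThreadPoolExecutor is a sensible fit for extract_gz_batch: both file I/O and zlib decompression release the GIL in CPython, so threads can genuinely overlap this work without multiprocessing overhead.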
@@ -164,8 +164,8 @@ class GenerateTimeseries:
                 executor.shutdown(wait=False, cancel_futures=True)
                 raise
 
-    def write_results_to_csv(self, results, locations):
-        """Write extracted data to CSV files for each zone.
+    def append_results_to_csv(self, results, locations):
+        """Append extracted data to CSV files for each zone.
 
         Args:
             results (dict): Aggregated results {zone_id: {'dates': [], 'values': []}}
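For reference, a hypothetical call shaped after the docstring above (the structure of locations is not shown in this diff, so it is left empty here):

    results = {
        "zone_1": {"dates": ["2024-01-01 00:00:00"], "values": [1.2345]},
    }
    locations = {}  # real structure not visible in this diff
    generator.append_results_to_csv(results, locations)  # generator: a GenerateTimeseries instance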
@@ -217,19 +217,41 @@ class GenerateTimeseries:
 
                 df_dict[grid_square] = aligned_values
 
-            df = pd.DataFrame(df_dict)
+            new_df = pd.DataFrame(df_dict)
 
-            # Sort by datetime (already sorted)
-            sorted_df = df.sort("datetime")
-
-            # Format datetime column
-            sorted_df = sorted_df.with_columns(
+            # Format datetime column in new_df
+            new_df = new_df.with_columns(
                 pd.col("datetime").dt.strftime("%Y-%m-%d %H:%M:%S")
             )
 
             output_path = (
                 Path(self.config.COMBINED_FOLDER) / f"{zone_name}_timeseries_data.csv"
             )
-            sorted_df.write_csv(output_path, float_precision=4)
 
-        logging.info("All CSV files written.")
+            if output_path.exists():
+                # Load existing CSV
+                existing_df = pd.read_csv(output_path)
+
+                # Reorder new_df to match existing_df
+                new_df = new_df.select(existing_df.columns)
+
+                # Concatenate
+                combined_df = pd.concat([existing_df, new_df])
+                # Sort by datetime
+                combined_df = combined_df.sort("datetime")
+                # Write back
+                combined_df.write_csv(output_path, float_precision=4)
+            else:
+                # Write new CSV
+                # Sort columns to ensure deterministic order (datetime first)
+                cols = new_df.columns
+                cols.remove("datetime")
+                cols.sort()
+                sorted_cols = ["datetime"] + cols
+                new_df = new_df.select(sorted_cols)
+
+                # Sort by datetime (already sorted but good practice)
+                sorted_df = new_df.sort("datetime")
+                sorted_df.write_csv(output_path, float_precision=4)
+
+        logging.info("All CSV files updated.")
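The warning about pre-existing CSV files mentioned in the commit message does not appear in the hunks above either; it presumably lives elsewhere in the changeset. A minimal sketch of such a check (function name and wording are hypothetical):

    from pathlib import Path
    import logging

    def warn_if_existing_csvs(combined_folder):
        # Hypothetical: tell the user that new results will be appended, not overwritten.
        existing = list(Path(combined_folder).glob("*.csv"))
        if existing:
            logging.warning(
                "Found %d existing CSV file(s) in %s; new results will be appended.",
                len(existing),
                combined_folder,
            )

This pairs naturally with the new append path in append_results_to_csv, which concatenates onto any CSV it finds rather than overwriting it.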