From 59f459d4d076113738dc9f052ca543fe00a27788 Mon Sep 17 00:00:00 2001
From: Jake Pullen <hello@jake-is.me>
Date: Tue, 9 Dec 2025 16:29:48 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20=E2=9C=A8=20Reduced=20the=20amount=20of?=
 =?UTF-8?q?=20steps=20and=20saved=20a=20lot=20of=20ram?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.MD                      |  1 -
 config.py                      |  6 ++--
 main.py                        |  6 ----
 modules/combine_timeseries.py  | 54 ----------------------------------
 modules/generate_timeseries.py | 34 ---------------------
 5 files changed, 2 insertions(+), 99 deletions(-)

diff --git a/README.MD b/README.MD
index 1a67d0f..c234d8e 100644
--- a/README.MD
+++ b/README.MD
@@ -66,7 +66,6 @@ The `config.py` file defines folder paths:
 
 - DAT_TOP_FOLDER: "./dat_files"
 - ASC_TOP_FOLDER: "./asc_files"
-- CSV_TOP_FOLDER: "./csv_files"
 - COMBINED_FOLDER: "./combined_files"
 
 Example of how the zone csv files should look:
diff --git a/config.py b/config.py
index fecadd0..4c11afd 100644
--- a/config.py
+++ b/config.py
@@ -1,10 +1,8 @@
 class Config:
     DAT_TOP_FOLDER = "./dat_files"
     ASC_TOP_FOLDER = "./asc_files"
-    CSV_TOP_FOLDER = "./csv_files"
     COMBINED_FOLDER = "./combined_files"
     ZONE_FOLDER = "./zone_inputs"
 
-    delete_dat_after_processing = False
-    delete_asc_after_processing = True
-    delete_csv_after_combining = True
\ No newline at end of file
+    delete_dat_after_processing = True
+    delete_asc_after_processing = True
\ No newline at end of file
diff --git a/main.py b/main.py
index 4f2d527..4f30814 100644
--- a/main.py
+++ b/main.py
@@ -14,7 +14,6 @@ logging.basicConfig(
 
 if __name__ == "__main__":
     os.makedirs(Path(Config.ASC_TOP_FOLDER), exist_ok=True)
-    #os.makedirs(Path(Config.CSV_TOP_FOLDER), exist_ok=True)
     os.makedirs(Path(Config.COMBINED_FOLDER), exist_ok=True)
 
     locations = []
@@ -92,11 +91,6 @@ if __name__ == "__main__":
 
     logging.info("Writing CSV files...")
     timeseries.write_results_to_csv(results, locations)
-    # results.clear()
-
-    # logging.info("combining CSVs into groups")
-    # combiner.combine_csv_files()
-    # logging.info("CSVs combined!")
     end = time.time()
     elapsed_time = end - start
 
diff --git a/modules/combine_timeseries.py b/modules/combine_timeseries.py
index a4b32c6..2671e5c 100644
--- a/modules/combine_timeseries.py
+++ b/modules/combine_timeseries.py
@@ -1,5 +1,3 @@
-import polars as pd
-import os
 import logging
 
 class CombineTimeseries:
@@ -16,55 +14,3 @@ class CombineTimeseries:
                 self.grouped_locations[group] = []
             self.grouped_locations[group].append(location)
         logging.info(f'Count of zones: {len(self.grouped_locations)}')
-
-    # def combine_csv_files(self):
-    #     to_delete = []
-    #     for group, loc_list in self.grouped_locations.items():
-    #         output_file =f"{self.config.COMBINED_FOLDER}/zone_{group}_timeseries_data.csv"
-    #         combined_df = None
-    #         for loc in loc_list:
-    #             csv_to_load = f"{self.config.CSV_TOP_FOLDER}/{loc[0]}_timeseries_data.csv"
-    #             df = pd.read_csv(csv_to_load, streaming=True)
-    #             if combined_df is None:
-    #                 combined_df = df
-    #             else:
-    #                 combined_df = combined_df.join(df, on='datetime')
-
-    #             if self.config.delete_csv_after_combining:
-    #                 to_delete.append(csv_to_load)
-
-    #         sorted_df = combined_df.sort('datetime')
-    #         print(f'writing file to {output_file}')
-    #         sorted_df.write_csv(output_file)
-
-    #     if len(to_delete) > 0:
-    #         for path in to_delete:
-    #             print(f'deleting {path}')
-    #             os.remove(path)
-
-    def combine_csv_files(self):  
-        to_delete = []  
-        for group, loc_list in self.grouped_locations.items():  
-            output_file = f"{self.config.COMBINED_FOLDER}/zone_{group}_timeseries_data.csv"  
-            
-            # Use LazyFrame for memory-efficient processing
-            lazy_dfs = []
-            for loc in loc_list:  
-                csv_to_load = f"{self.config.CSV_TOP_FOLDER}/{loc[0]}_timeseries_data.csv"  
-                df = pd.scan_csv(csv_to_load)  # Lazy read
-                lazy_dfs.append(df)
-
-                if self.config.delete_csv_after_combining:
-                    to_delete.append(csv_to_load)
-
-            # Combine with LazyFrame operations
-            combined_lazy = pd.concat(lazy_dfs, how='align').collect(streaming=True)  # Collect at the end
-
-            sorted_df = combined_lazy.sort('datetime')
-            print(f'writing file to {output_file}')  
-            sorted_df.write_csv(output_file)  
-    
-        if len(to_delete) > 0:  
-            for path in to_delete:  
-                print(f'deleting {path}')  
-                os.remove(path)
\ No newline at end of file
diff --git a/modules/generate_timeseries.py b/modules/generate_timeseries.py
index 171bc5a..09b5847 100644
--- a/modules/generate_timeseries.py
+++ b/modules/generate_timeseries.py
@@ -144,7 +144,6 @@ class GenerateTimeseries:
 
         # Use ThreadPoolExecutor for concurrent processing
         # Since we are using Python 3.14t (free-threaded), this should scale well even for CPU work
-        # mixed with I/O.
         with concurrent.futures.ThreadPoolExecutor() as executor:
             # Submit all tasks
             future_to_file = {
@@ -170,39 +169,6 @@ class GenerateTimeseries:
                 executor.shutdown(wait=False, cancel_futures=True)
                 raise
 
-    # def write_results_to_csv(self, results, locations):
-    #     """Write extracted data to CSV files for each location.
-
-    #     Args:
-    #         results (dict): Aggregated results {zone_id: {'dates': [], 'values': []}}
-    #         locations (list): List of location data
-    #     """
-    #     for location in locations:
-    #         grid_square = location[0]
-    #         zone = location[3]
-    #         data = results[grid_square]
-            
-    #         if not data['dates']:
-    #             print(f"No data found for {grid_square}")
-    #             continue
-
-    #         df = pd.DataFrame({"datetime": data['dates'], grid_square: data['values']})
-
-    #         # Sort the dataframe into date order
-    #         sorted_df = df.sort("datetime")
-            
-    #         # Format datetime column
-    #         sorted_df = sorted_df.with_columns(
-    #             pd.col("datetime").dt.strftime("%Y-%m-%d %H:%M:%S")
-    #         )
-
-    #         output_path = Path(self.config.CSV_TOP_FOLDER) / f"{zone}_timeseries_data.csv"
-    #         sorted_df.write_csv(
-    #             output_path,
-    #             float_precision=4
-    #         )
-    #     logging.info("All CSV files written.")
-
     def write_results_to_csv(self, results, locations):
         """Write extracted data to CSV files for each zone.