feat: Reduce the number of steps and save a lot of RAM

2025-12-09 16:29:48 +00:00
parent 84ba6c837c
commit 59f459d4d0
5 changed files with 2 additions and 99 deletions
-54
@@ -1,5 +1,3 @@
-import polars as pd
-import os
 import logging
 
 class CombineTimeseries:
@@ -16,55 +14,3 @@ class CombineTimeseries:
                 self.grouped_locations[group] = []
             self.grouped_locations[group].append(location)
         logging.info(f'Count of zones: {len(self.grouped_locations)}')
-
-    # def combine_csv_files(self):
-    #     to_delete = []
-    #     for group, loc_list in self.grouped_locations.items():
-    #         output_file = f"{self.config.COMBINED_FOLDER}/zone_{group}_timeseries_data.csv"
-    #         combined_df = None
-    #         for loc in loc_list:
-    #             csv_to_load = f"{self.config.CSV_TOP_FOLDER}/{loc[0]}_timeseries_data.csv"
-    #             df = pd.read_csv(csv_to_load, streaming=True)
-    #             if combined_df is None:
-    #                 combined_df = df
-    #             else:
-    #                 combined_df = combined_df.join(df, on='datetime')
-    #             if self.config.delete_csv_after_combining:
-    #                 to_delete.append(csv_to_load)
-    #         sorted_df = combined_df.sort('datetime')
-    #         print(f'writing file to {output_file}')
-    #         sorted_df.write_csv(output_file)
-    #     if len(to_delete) > 0:
-    #         for path in to_delete:
-    #             print(f'deleting {path}')
-    #             os.remove(path)
-
-    def combine_csv_files(self):
-        to_delete = []
-        for group, loc_list in self.grouped_locations.items():
-            output_file = f"{self.config.COMBINED_FOLDER}/zone_{group}_timeseries_data.csv"
-            # Use LazyFrame for memory-efficient processing
-            lazy_dfs = []
-            for loc in loc_list:
-                csv_to_load = f"{self.config.CSV_TOP_FOLDER}/{loc[0]}_timeseries_data.csv"
-                df = pd.scan_csv(csv_to_load)  # Lazy read
-                lazy_dfs.append(df)
-                if self.config.delete_csv_after_combining:
-                    to_delete.append(csv_to_load)
-            # Combine with LazyFrame operations
-            combined_lazy = pd.concat(lazy_dfs, how='align').collect(streaming=True)  # Collect at the end
-            sorted_df = combined_lazy.sort('datetime')
-            print(f'writing file to {output_file}')
-            sorted_df.write_csv(output_file)
-        if len(to_delete) > 0:
-            for path in to_delete:
-                print(f'deleting {path}')
-                os.remove(path)
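For reference, the deleted combine step used polars' lazy API: scan_csv defers reading, concat(how='align') aligns the per-location frames on their shared columns, and collect(streaming=True) executes the plan in chunks. A minimal self-contained sketch of that pattern, with hypothetical input paths and output name, and polars imported as the conventional pl rather than the pd alias used above:

import polars as pl

# Hypothetical per-location inputs; the real paths came from self.config.
csv_paths = ["loc_a_timeseries_data.csv", "loc_b_timeseries_data.csv"]

# scan_csv builds LazyFrames: nothing is loaded into memory yet.
lazy_dfs = [pl.scan_csv(path) for path in csv_paths]

# how='align' joins the frames on their common column ('datetime'),
# producing one wide frame with a value column per location.
combined = pl.concat(lazy_dfs, how="align").sort("datetime")

# Streaming execution processes the plan in chunks instead of
# materialising every input at once (newer polars versions spell
# this collect(engine="streaming")).
combined.collect(streaming=True).write_csv("zone_1_timeseries_data.csv")

A LazyFrame also supports sink_csv, which streams the sorted result straight to disk without ever materialising the combined frame.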
-34
@@ -144,7 +144,6 @@ class GenerateTimeseries:
         # Use ThreadPoolExecutor for concurrent processing
         # Since we are using Python 3.14t (free-threaded), this should scale well even for CPU work
         # mixed with I/O.
-
         with concurrent.futures.ThreadPoolExecutor() as executor:
             # Submit all tasks
             future_to_file = {
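The executor pattern around these hunks is standard concurrent.futures usage: submit everything up front, consume results with as_completed, and cancel outstanding work on failure (the shutdown/raise context in the next hunk). On a free-threaded build such as 3.14t, threads are not serialised by the GIL, so this can scale for CPU-bound extraction as the comment says. A minimal sketch with a stand-in worker; extract and the file list are hypothetical:

import concurrent.futures

def extract(path):
    # Hypothetical stand-in for the per-file extraction work.
    return len(path)

files = ["file_a.dat", "file_b.dat", "file_c.dat"]  # hypothetical inputs

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit all tasks, then handle results as they finish.
    future_to_file = {executor.submit(extract, f): f for f in files}
    try:
        for future in concurrent.futures.as_completed(future_to_file):
            print(future_to_file[future], future.result())  # result() re-raises worker errors
    except Exception:
        # As in the diff: stop waiting and drop queued tasks on failure.
        executor.shutdown(wait=False, cancel_futures=True)
        raise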
@@ -170,39 +169,6 @@ class GenerateTimeseries:
                 executor.shutdown(wait=False, cancel_futures=True)
                 raise
-
-    # def write_results_to_csv(self, results, locations):
-    #     """Write extracted data to CSV files for each location.
-
-    #     Args:
-    #         results (dict): Aggregated results {zone_id: {'dates': [], 'values': []}}
-    #         locations (list): List of location data
-    #     """
-    #     for location in locations:
-    #         grid_square = location[0]
-    #         zone = location[3]
-    #         data = results[grid_square]
-    #         if not data['dates']:
-    #             print(f"No data found for {grid_square}")
-    #             continue
-    #         df = pd.DataFrame({"datetime": data['dates'], grid_square: data['values']})
-    #         # Sort the dataframe into date order
-    #         sorted_df = df.sort("datetime")
-    #         # Format datetime column
-    #         sorted_df = sorted_df.with_columns(
-    #             pd.col("datetime").dt.strftime("%Y-%m-%d %H:%M:%S")
-    #         )
-    #         output_path = Path(self.config.CSV_TOP_FOLDER) / f"{zone}_timeseries_data.csv"
-    #         sorted_df.write_csv(
-    #             output_path,
-    #             float_precision=4
-    #         )
-    #     logging.info("All CSV files written.")
-
     def write_results_to_csv(self, results, locations):
         """Write extracted data to CSV files for each zone.