exploring options
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import polars as pd
|
||||
import os
|
||||
|
||||
import logging
|
||||
|
||||
class CombineTimeseries:
|
||||
def __init__(self, config, locations):
|
||||
@@ -15,23 +15,56 @@ class CombineTimeseries:
|
||||
if group not in self.grouped_locations:
|
||||
self.grouped_locations[group] = []
|
||||
self.grouped_locations[group].append(location)
|
||||
logging.info(f'Count of zones: {len(self.grouped_locations)}')
|
||||
|
||||
def combine_csv_files(self):
|
||||
for group, loc_list in self.grouped_locations.items():
|
||||
combined_df = None
|
||||
for loc in loc_list:
|
||||
csv_to_load = f"./csv_files/{loc[0]}_timeseries_data.csv"
|
||||
df = pd.read_csv(csv_to_load)
|
||||
if combined_df is None:
|
||||
combined_df = df
|
||||
else:
|
||||
combined_df = combined_df.join(df, on='datetime')
|
||||
# def combine_csv_files(self):
|
||||
# to_delete = []
|
||||
# for group, loc_list in self.grouped_locations.items():
|
||||
# output_file =f"{self.config.COMBINED_FOLDER}/zone_{group}_timeseries_data.csv"
|
||||
# combined_df = None
|
||||
# for loc in loc_list:
|
||||
# csv_to_load = f"{self.config.CSV_TOP_FOLDER}/{loc[0]}_timeseries_data.csv"
|
||||
# df = pd.read_csv(csv_to_load, streaming=True)
|
||||
# if combined_df is None:
|
||||
# combined_df = df
|
||||
# else:
|
||||
# combined_df = combined_df.join(df, on='datetime')
|
||||
|
||||
# if self.config.delete_csv_after_combining:
|
||||
# to_delete.append(csv_to_load)
|
||||
|
||||
# sorted_df = combined_df.sort('datetime')
|
||||
# print(f'writing file to {output_file}')
|
||||
# sorted_df.write_csv(output_file)
|
||||
|
||||
# if len(to_delete) > 0:
|
||||
# for path in to_delete:
|
||||
# print(f'deleting {path}')
|
||||
# os.remove(path)
|
||||
|
||||
def combine_csv_files(self):
|
||||
to_delete = []
|
||||
for group, loc_list in self.grouped_locations.items():
|
||||
output_file = f"{self.config.COMBINED_FOLDER}/zone_{group}_timeseries_data.csv"
|
||||
|
||||
# Use LazyFrame for memory-efficient processing
|
||||
lazy_dfs = []
|
||||
for loc in loc_list:
|
||||
csv_to_load = f"{self.config.CSV_TOP_FOLDER}/{loc[0]}_timeseries_data.csv"
|
||||
df = pd.scan_csv(csv_to_load) # Lazy read
|
||||
lazy_dfs.append(df)
|
||||
|
||||
if self.config.delete_csv_after_combining:
|
||||
os.remove(csv_to_load)
|
||||
to_delete.append(csv_to_load)
|
||||
|
||||
output_file = (
|
||||
f"{self.config.COMBINED_FOLDER}/zone_{group}_timeseries_data.csv"
|
||||
)
|
||||
sorted_df = combined_df.sort('datetime')
|
||||
sorted_df.write_csv(output_file)
|
||||
# Combine with LazyFrame operations
|
||||
combined_lazy = pd.concat(lazy_dfs, how='align').collect(streaming=True) # Collect at the end
|
||||
|
||||
sorted_df = combined_lazy.sort('datetime')
|
||||
print(f'writing file to {output_file}')
|
||||
sorted_df.write_csv(output_file)
|
||||
|
||||
if len(to_delete) > 0:
|
||||
for path in to_delete:
|
||||
print(f'deleting {path}')
|
||||
os.remove(path)
|
||||
@@ -5,6 +5,7 @@ import polars as pd
|
||||
from datetime import datetime
|
||||
import os
|
||||
import concurrent.futures
|
||||
import logging
|
||||
|
||||
|
||||
|
||||
@@ -169,7 +170,6 @@ class GenerateTimeseries:
|
||||
executor.shutdown(wait=False, cancel_futures=True)
|
||||
raise
|
||||
|
||||
# Write CSVs for each location
|
||||
def write_results_to_csv(self, results, locations):
|
||||
"""Write extracted data to CSV files for each location.
|
||||
|
||||
@@ -177,7 +177,6 @@ class GenerateTimeseries:
|
||||
results (dict): Aggregated results {zone_id: {'dates': [], 'values': []}}
|
||||
locations (list): List of location data
|
||||
"""
|
||||
print("Writing CSV files...")
|
||||
for location in locations:
|
||||
zone_id = location[0]
|
||||
data = results[zone_id]
|
||||
@@ -201,4 +200,4 @@ class GenerateTimeseries:
|
||||
output_path,
|
||||
float_precision=4
|
||||
)
|
||||
print("All CSV files written.")
|
||||
logging.info("All CSV files written.")
|
||||
|
||||
Reference in New Issue
Block a user