70 lines
2.7 KiB
Python
70 lines
2.7 KiB
Python
import polars as pd
|
|
import os
|
|
import logging
|
|
|
|
class CombineTimeseries:
    """Group locations by zone and merge each zone's CSV files into one CSV.

    NOTE: in this module ``pd`` is the alias used for *polars*
    (``import polars as pd``), not pandas.
    """

    def __init__(self, config, locations):
        """Store the inputs and build the zone grouping immediately.

        Args:
            config: Settings object. This class reads ``COMBINED_FOLDER``,
                ``CSV_TOP_FOLDER`` and ``delete_csv_after_combining`` from it.
            locations: Iterable of indexable records. Index 0 is used to build
                the per-location CSV file name and index 3 is the zone number.
        """
        self.config = config
        self.locations = locations
        # Maps zone number -> list of location records belonging to that zone.
        self.grouped_locations = {}
        self.build_location_groups()

    def build_location_groups(self):
        """Rebuild ``self.grouped_locations`` from ``self.locations``.

        The mapping is rebuilt from scratch on every call, so calling this
        method again does not append duplicate entries to existing zones.
        """
        grouped = {}
        for location in self.locations:
            zone = location[3]  # zone number
            grouped.setdefault(zone, []).append(location)
        self.grouped_locations = grouped
        logging.info(f'Count of zones: {len(self.grouped_locations)}')

    def combine_csv_files(self):
        """Write one combined CSV per zone, sorted by the ``datetime`` column.

        For every zone, the per-location files
        ``<CSV_TOP_FOLDER>/<loc[0]>_timeseries_data.csv`` are scanned lazily,
        combined with polars' ``how='align'`` concatenation strategy, sorted
        by ``datetime`` and written to
        ``<COMBINED_FOLDER>/zone_<zone>_timeseries_data.csv``.

        When ``config.delete_csv_after_combining`` is truthy, the source
        files are removed, but only after every zone has been written, so an
        error while combining leaves all inputs on disk.

        Raises:
            OSError: Propagated from ``os.remove`` if a source file cannot be
                deleted.
        """
        to_delete = []
        for group, loc_list in self.grouped_locations.items():
            output_file = f"{self.config.COMBINED_FOLDER}/zone_{group}_timeseries_data.csv"
            source_paths = [
                f"{self.config.CSV_TOP_FOLDER}/{loc[0]}_timeseries_data.csv"
                for loc in loc_list
            ]

            # Lazy reads: the data is only materialised by collect() below.
            lazy_frames = [pd.scan_csv(path) for path in source_paths]

            if self.config.delete_csv_after_combining:
                to_delete.extend(source_paths)

            # Combine lazily and collect once at the end.
            combined_df = pd.concat(lazy_frames, how='align').collect(streaming=True)
            sorted_df = combined_df.sort('datetime')
            print(f'writing file to {output_file}')
            sorted_df.write_csv(output_file)

        # Deliberately outside the zone loop: sources are deleted only once
        # all combined files exist. An empty list simply skips the loop.
        for path in to_delete:
            print(f'deleting {path}')
            os.remove(path)