Now deletes existing combined CSV files at startup, after user confirmation.

2025-12-15 10:13:11 +00:00
3 changed files with 37 additions and 25 deletions
@@ -17,7 +17,6 @@ The project consists of a main pipeline workflow that processes multiple modules

 - **Startup Safety Check**: Scans the `COMBINED_FOLDER` at startup and warns the user if existing files are found; if the user chooses to continue, the existing files are deleted.
 - **Batch Processing**: Processes input tar files in configurable batches to manage resource usage.
- **Tidy by Default**: Default settings will delete all mid-step files and keep only the original tar files. This can be changed in `config.py`.
 - **End-to-End Processing**: Extracts GZ files, processes DAT/ASC, and appends to CSV in a single thread per file.
 - **Concurrency**: Uses multi-threading to process individual GZ files within a batch concurrently.
 - **Cumulative Data**: Automatically appends new query results to the existing CSV files in `COMBINED_FOLDER` for each batch, ensuring no data is lost and columns are correctly aligned.
@@ -32,7 +31,7 @@ The project consists of a main pipeline workflow that processes multiple modules

 - Process multiple NIMROD dat files
 - Automatically extract datetime from file data
- Export raster data to ASC format
+- Export clipped raster data to ASC format

 ### generate_timeseries.py

@@ -100,9 +100,7 @@ if __name__ == "__main__":
            logging.info("Aborting...")
            exit(0)
        else:
-            shutil.rmtree(
-                Path(Config.COMBINED_FOLDER)
-            )  # Delete everything including the directory
+            shutil.rmtree(Path(Config.COMBINED_FOLDER))  # Delete everything including the directory
            Path(Config.COMBINED_FOLDER).mkdir()

    extraction = Extract(Config)
@@ -217,8 +215,6 @@ if __name__ == "__main__":
    elif elapsed_time < 3600:
        elapsed_time_str = f"{int(elapsed_time // 60)}m {int(elapsed_time % 60)}s"
    else:
-        elapsed_time_str = (
-            f"{int(elapsed_time // 3600)}h {int((elapsed_time % 3600) // 60)}m"
-        )
+        elapsed_time_str = f"{int(elapsed_time // 3600)}h {int((elapsed_time % 3600) // 60)}m"

    logging.info(f"All Complete total time {elapsed_time_str}")
@@ -56,11 +56,11 @@ class GenerateTimeseries:
        xpp = ncols_basin * cellres_basin
        ypp = nrows_basin * cellres_basin

-        start_col = np.floor(xp / cellres_radar) - 1
-        end_col = np.ceil((xpp + xp) / cellres_radar) - 1
+        start_col = np.floor(xp / cellres_radar)
+        end_col = np.ceil((xpp + xp) / cellres_radar)

-        start_row = np.floor(nrows_radar - ((yp + ypp) / cellres_radar)) + 1
-        end_row = np.ceil(nrows_radar - (yp / cellres_radar)) + 1
+        start_row = np.floor(nrows_radar - ((yp + ypp) / cellres_radar))
+        end_row = np.ceil(nrows_radar - (yp / cellres_radar))

        return int(start_col), int(start_row), int(end_col), int(end_row)

@@ -178,26 +178,43 @@ class GenerateTimeseries:
            zone_name = loc[3]

            if zone_name not in zone_data:
-                zone_data[zone_name] = {"dates": set(), "values": {}}
+                zone_data[zone_name] = {"dates": [], "values": {}}

-            # Create date -> value map for this grid square
-            raw_dates = results[zone_id]["dates"]
-            raw_values = results[zone_id]["values"]
-            date_value_map = dict(zip(raw_dates, raw_values))
+            zone_data[zone_name]["values"][zone_id] = results[zone_id]["values"]
+            zone_data[zone_name]["dates"].extend(results[zone_id]["dates"])

-            zone_data[zone_name]["values"][zone_id] = date_value_map
-            zone_data[zone_name]["dates"].update(raw_dates)
+        # Get unique sorted dates across all zones
+        for zone_name, data in zone_data.items():
+            data["dates"] = sorted(set(data["dates"]))

        # Now write one CSV per zone with aligned timestamps
        for zone_name, data in zone_data.items():
-            sorted_dates = sorted(data["dates"])
+            dates = data["dates"]
            values_dict = data["values"]

            # Create aligned DataFrame
-            df_dict = {"datetime": sorted_dates}
-            for grid_square, dv_map in values_dict.items():
-                # Align values to the common search dates using the map
-                aligned_values = [dv_map.get(d) for d in sorted_dates]
+            df_dict = {"datetime": dates}
+            for grid_square, values in values_dict.items():
+                # Align values to the common dates
+                aligned_values = []
+                value_iter = iter(values)
+                date_iter = iter(dates)
+
+                current_date = next(date_iter, None)
+                current_value = next(value_iter, None)
+
+                for expected_date in dates:
+                    if current_date == expected_date:
+                        aligned_values.append(current_value)
+                        try:
+                            current_date = next(date_iter)
+                            current_value = next(value_iter)
+                        except StopIteration:
+                            current_date = None
+                            current_value = None
+                    else:
+                        aligned_values.append(None)  # Missing value
+
                df_dict[grid_square] = aligned_values

            new_df = pd.DataFrame(df_dict)