chore: 🧹 Ruff clean up

fix: 🐞 Fixed an ordering issue when saving to CSV
docs: 📜 ReadMe Clarity tweaks
2025-12-24 15:32:41 +00:00 · 2025-12-24 15:31:36 +00:00 · 2025-12-17 09:54:46 +00:00 · 2025-12-15 10:17:27 +00:00 · 2025-12-12 19:56:14 +00:00
5 changed files with 41 additions and 49 deletions
@@ -15,8 +15,9 @@ The project consists of a main pipeline workflow that processes multiple modules

 ### main.py

- **Startup Safety Check**: Scans the `COMBINED_FOLDER` at startup and warns the user if existing files are found, offering a chance to abort to prevent accidental data mixing.
+- **Startup Safety Check**: Scans the `COMBINED_FOLDER` at startup and warns the user if existing files are found; if the user chooses to continue, the existing files are deleted.
 - **Batch Processing**: Processes input tar files in configurable batches to manage resource usage.
+- **Tidy by Default**: The default settings will delete all intermediate files and keep only the original tar files. This can be changed in `config.py`.
 - **End-to-End Processing**: Extracts GZ files, processes DAT/ASC, and appends to CSV in a single thread per file.
 - **Concurrency**: Uses multi-threading to process individual GZ files within a batch concurrently.
 - **Cumulative Data**: Automatically appends new query results to the existing CSV files in `COMBINED_FOLDER` for each batch, ensuring no data is lost and columns are correctly aligned.
@@ -31,7 +32,7 @@ The project consists of a main pipeline workflow that processes multiple modules

 - Process multiple NIMROD dat files
 - Automatically extract datetime from file data
- Export clipped raster data to ASC format
+- Export raster data to ASC format

 ### generate_timeseries.py

@@ -92,13 +92,18 @@ if __name__ == "__main__":
            f"Found {len(existing_combined)} files in {Config.COMBINED_FOLDER}"
        )
        logging.warning(
-            "You may want to remove these before continuing to avoid duplicates or messy data."
+            "If you continue these WILL BE DELETED, Please make sure you have them saved."
        )
        logging.warning("!" * 80)
        response = input("Continue? (Y/N): ").strip().lower()
        if response != "y":
            logging.info("Aborting...")
            exit(0)
+        else:
+            shutil.rmtree(
+                Path(Config.COMBINED_FOLDER)
+            )  # Delete everything including the directory
+            Path(Config.COMBINED_FOLDER).mkdir()

    extraction = Extract(Config)
    batch = BatchNimrod(Config)
@@ -130,12 +135,6 @@ if __name__ == "__main__":
        # 1. Extract batch (TAR -> GZ)
        logging.info("Extracting tar files for batch")
        extraction.extract_tar_batch(batch_files)
-        # Note: We do NOT run extract_gz_batch anymore. We will find GZ files and process them.
-
-        # Get list of GZ files (recursively or flat?)
-        # extract_tar_batch puts them in GZ_TOP_FOLDER/tar_name_without_ext
-        # So we need to look there.
-        # Ideally we know where we put them.

        gz_files_to_process = []
        for tar_file in batch_files:
@@ -167,14 +166,14 @@ if __name__ == "__main__":

                    completed_count += 1
                    if completed_count % 100 == 0:
-                        elapsed_time = time.time() - start
-                        rate_per_second = completed_count / elapsed_time
-
                        files_processed_previous = i * files_per_tar
                        files_processed_so_far = (
                            files_processed_previous + completed_count
                        )

+                        elapsed_time = time.time() - start
+                        rate_per_second = files_processed_so_far / elapsed_time
+
                        remaining_files = estimated_total_files - files_processed_so_far

                        if rate_per_second > 0:
@@ -213,4 +212,13 @@ if __name__ == "__main__":
    end = time.time()
    elapsed_time = end - start

-    logging.info(f"All Complete total time {elapsed_time:.2f} seconds")
+    if elapsed_time < 60:
+        elapsed_time_str = f"{int(elapsed_time)}s"
+    elif elapsed_time < 3600:
+        elapsed_time_str = f"{int(elapsed_time // 60)}m {int(elapsed_time % 60)}s"
+    else:
+        elapsed_time_str = (
+            f"{int(elapsed_time // 3600)}h {int((elapsed_time % 3600) // 60)}m"
+        )
+
+    logging.info(f"All Complete total time {elapsed_time_str}")
@@ -56,11 +56,11 @@ class GenerateTimeseries:
        xpp = ncols_basin * cellres_basin
        ypp = nrows_basin * cellres_basin

-        start_col = np.floor(xp / cellres_radar)
-        end_col = np.ceil((xpp + xp) / cellres_radar)
+        start_col = np.floor(xp / cellres_radar) - 1
+        end_col = np.ceil((xpp + xp) / cellres_radar) - 1

-        start_row = np.floor(nrows_radar - ((yp + ypp) / cellres_radar))
-        end_row = np.ceil(nrows_radar - (yp / cellres_radar))
+        start_row = np.floor(nrows_radar - ((yp + ypp) / cellres_radar)) + 1
+        end_row = np.ceil(nrows_radar - (yp / cellres_radar)) + 1

        return int(start_col), int(start_row), int(end_col), int(end_row)

@@ -178,43 +178,26 @@ class GenerateTimeseries:
            zone_name = loc[3]

            if zone_name not in zone_data:
-                zone_data[zone_name] = {"dates": [], "values": {}}
+                zone_data[zone_name] = {"dates": set(), "values": {}}

-            zone_data[zone_name]["values"][zone_id] = results[zone_id]["values"]
-            zone_data[zone_name]["dates"].extend(results[zone_id]["dates"])
+            # Create date -> value map for this grid square
+            raw_dates = results[zone_id]["dates"]
+            raw_values = results[zone_id]["values"]
+            date_value_map = dict(zip(raw_dates, raw_values))

-        # Get unique sorted dates across all zones
-        for zone_name, data in zone_data.items():
-            data["dates"] = sorted(set(data["dates"]))
+            zone_data[zone_name]["values"][zone_id] = date_value_map
+            zone_data[zone_name]["dates"].update(raw_dates)

        # Now write one CSV per zone with aligned timestamps
        for zone_name, data in zone_data.items():
-            dates = data["dates"]
+            sorted_dates = sorted(data["dates"])
            values_dict = data["values"]

            # Create aligned DataFrame
-            df_dict = {"datetime": dates}
-            for grid_square, values in values_dict.items():
-                # Align values to the common dates
-                aligned_values = []
-                value_iter = iter(values)
-                date_iter = iter(dates)
-
-                current_date = next(date_iter, None)
-                current_value = next(value_iter, None)
-
-                for expected_date in dates:
-                    if current_date == expected_date:
-                        aligned_values.append(current_value)
-                        try:
-                            current_date = next(date_iter)
-                            current_value = next(value_iter)
-                        except StopIteration:
-                            current_date = None
-                            current_value = None
-                    else:
-                        aligned_values.append(None)  # Missing value
-
+            df_dict = {"datetime": sorted_dates}
+            for grid_square, dv_map in values_dict.items():
+                # Align values to the common search dates using the map
+                aligned_values = [dv_map.get(d) for d in sorted_dates]
                df_dict[grid_square] = aligned_values

            new_df = pd.DataFrame(df_dict)
@@ -1,7 +1,7 @@
 [project]
 name = "met-office"
-version = "1.3.0"
-description = "Convert .dat nimrod files to .asc files"
+version = "1.3.2"
+description = "Convert nimrod files to .csv timeseries"
 readme = "README.md"
 requires-python = ">=3.14"
 dependencies = [
@@ -4,7 +4,7 @@ requires-python = ">=3.14"

 [[package]]
 name = "met-office"
-version = "1.3.0"
+version = "1.3.2"
 source = { virtual = "." }
 dependencies = [
    { name = "numpy" },
Author	SHA1	Message	Date
Jake	5da185a826	chore: 🧹 Ruff clean up	2025-12-24 15:32:41 +00:00
Jake	1d21ab5f36	fix: 🐞 Fixed an ordering issue when saving to CSV	2025-12-24 15:31:36 +00:00
Jake	0e682aca35	docs: 📜 ReadMe Clarity tweaks	2025-12-17 09:54:46 +00:00
Jake-Pullen	354f4c7fc6	Now deleting existing combined csv files after confirmation at start. (#4 )	2025-12-15 10:17:27 +00:00
Jake-Pullen	a43edb1148	Extraction streamlining (#3 ) * feat: ✨ added the extraction process into the main multi threaded loop Also added a warning when the app finds existing CSV files in the combined folder * fix: 🐛 Fixed time calculations for ETA & Completion	2025-12-12 19:56:14 +00:00