Skip to content

Commit cfa817a

Browse files
committed
.
1 parent c721592 commit cfa817a

3 files changed

Lines changed: 224 additions & 32 deletions

File tree

configs/grid_index_config.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# Grid generation configuration for NestEOGrid (consumed by scripts/generate_grid_index.py)

# All resolution levels to generate, in meters (coarse → fine)
levels: [120000, 12000, 2400, 1200, 600, 300]
# Preferred level ordering for lookups (fine → coarse)
default_levels: [300, 600, 1200, 2400, 12000, 120000]

# Leave utm_zones undefined to process ALL zones (1N–60N, 1S–60S).
# If set, provide an explicit list of zone strings, e.g. ["31N", "32N", "1S"].
# utm_zones: ["31N", "32N"]

# Output settings
output_format: "PARQUET" # Format: PARQUET or SHP
output_dir: "D:/nesteo_hf/index_structure" # Where to write output

# Optional enhancements
# save_geohash: true # Save GeoHash column
include_polar: true # If True, also generates the NP/SP polar grids
# skip_existing: true # Skip grid generation if output already exists
# save_wgs_files: true # Export WGS84 versions alongside UTM
# save_single_file: true # Save full grid as one .parquet per level
generate: false # Flag to trigger actual generation (safe for dry-run toggle)

# Optional advanced
# chunked_levels: [300] # For large levels, save in multiple chunks
# ref_level: 12000 # Reference level for spatial alignment
# ref_dir: "D:/NestEO_hf/metadata_current/grids_geo/grid_12000m"

scripts/generate_grid_index.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from pathlib import Path
2+
from NestEO.grid import NestEOGrid
3+
import yaml
4+
5+
def load_config(config_path: Path) -> dict:
    """Load a YAML configuration file and return it as a dict.

    Parameters
    ----------
    config_path : Path
        Path to the YAML file.

    Returns
    -------
    dict
        Parsed configuration. An empty/blank file yields ``{}`` (not ``None``),
        so callers can safely use ``config.get(...)``.
    """
    # Explicit encoding avoids locale-dependent decoding on Windows.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}
8+
9+
10+
def main(config_file="generate_grid_index.yaml"):
    """Build the grid tile index described by a YAML configuration file.

    Parameters
    ----------
    config_file : str, default "generate_grid_index.yaml"
        Path to the YAML configuration (see configs/grid_index_config.yaml).

    Raises
    ------
    FileNotFoundError
        If *config_file* does not exist.
    ValueError
        If the configuration omits the required ``output_dir`` key.
    """
    config_path = Path(config_file)
    if not config_path.exists():
        raise FileNotFoundError(f"Config file not found: {config_path.resolve()}")

    config = load_config(config_path)

    # Fail early with a clear message instead of Path(None) blowing up below.
    output_dir = config.get("output_dir")
    if not output_dir:
        raise ValueError("Config must define 'output_dir'.")

    # Only the parameters currently exercised are forwarded. NestEOGrid
    # supports more options (utm_zones, output_format, save_wgs_files,
    # chunked_levels, ref_level, ref_dir, ...) — wire them up here as needed.
    grid = NestEOGrid(
        levels=config.get("levels", [120000, 12000, 2400, 1200]),
        default_levels=config.get("default_levels"),
        include_polar=config.get("include_polar", True),
        output_dir=output_dir,
        generate=config.get("generate", False),
    )

    # Write the flat tile_id/super_id index (no geometries are generated).
    grid.build_tile_index_parquet(Path(output_dir) / "grid_index.parquet")
45+
46+
47+
if __name__ == "__main__":
    import sys

    # Fallback matches main()'s own default; previously the CLI fell back to
    # "grid_config.yaml", which matched neither main()'s default nor the
    # shipped configs/grid_index_config.yaml.
    config_arg = sys.argv[1] if len(sys.argv) > 1 else "generate_grid_index.yaml"
    main(config_file=config_arg)

src/NestEO/grid/grid_generator.py

Lines changed: 149 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -515,27 +515,8 @@ def _zero_tile_tuples(self, zone: str) -> set[tuple[int,int]]:
515515
self._zero_cache[zone] = tuples
516516
return tuples
517517

518-
519-
# pat = join(
520-
# self.ref_dir,
521-
# f"lc_proportions_*_{zone}_{self.ref_level}m.parquet")
522-
# files = glob.glob(pat)
523-
# if not files:
524-
# raise FileNotFoundError(f"No ref‑level parquet for zone {zone} under {pat}")
525-
526-
# df = pd.read_parquet(files[0], columns=["tile_id", "landcover_props"])
527-
# df = df[df["landcover_props"] == "{0: 1.0}"]
528-
529-
# # parse tile_id → (x_idx, y_idx) ‑‑ vectorised regex
530-
# rgx = re.compile(r"_X(\d+)_Y(\d+)")
531-
# tuples = set(
532-
# df["tile_id"].str.extract(rgx).astype(int).apply(tuple, axis=1)
533-
# )
534-
# self._zero_cache[zone] = tuples
535-
# return tuples
536518
# ─────────────────────────────────────────────────────────────────────────────
537519

538-
539520
# def ancestor_id_series(self, tile_id_series: pd.Series) -> pd.Series:
540521
# """
541522
# Vectorised: return the tile_id of the ancestor at *ref_level*
@@ -559,19 +540,6 @@ def _zero_tile_tuples(self, zone: str) -> set[tuple[int,int]]:
559540
# df["zone"] + "_X" + x_anc.astype(str).str.zfill(6) +
560541
# "_Y" + y_anc.astype(str).str.zfill(6)
561542
# )
562-
563-
# def _prefilter_grid_centroids(self, cols, rows, grid_size, crs, lon_bounds: Tuple[float, float]):
564-
# grid_x, grid_y = np.meshgrid(cols, rows)
565-
# grid_x = grid_x.ravel()
566-
# grid_y = grid_y.ravel()
567-
# print("Grid X and Y shape: ", grid_x.shape, grid_y.shape)
568-
# cx = grid_x + grid_size / 2
569-
# cy = grid_y + grid_size / 2
570-
# transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
571-
# lons, _ = transformer.transform(cx, cy)
572-
# lon_min, lon_max = lon_bounds
573-
# mask = (lons >= lon_min) & (lons <= lon_max)
574-
# return grid_x[mask], grid_y[mask]
575543

576544
def _prefilter_grid_centroids(self, cols, rows, grid_size, crs,
577545
lon_bounds: Tuple[float, float]):
@@ -749,6 +717,155 @@ def _construct_tile_file_path(self, zone: str, level: int, ext: Optional[str] =
749717
return join(self.output_dir, fname)
750718

751719

720+
# ──────────────────────────────────────────────────────────────
721+
def _iter_valid_xy(self, grid_size: int, zone: str):
    """
    Yield (x_idx, y_idx) integers for every tile that would exist at
    *grid_size* and *zone* without creating any geometry. Internal helper
    for fast index generation.

    Parameters
    ----------
    grid_size : int
        Tile edge length in meters (one of ``self.levels``).
    zone : str
        UTM zone label such as "31N"/"7S", or "NP"/"SP" for the polar zones.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Integer x / y tile indices relative to the zone origin.
    """
    if zone in ("NP", "SP"):  # Polar
        # Polar stereographic CRS: EPSG:3413 (Arctic) / EPSG:3031 (Antarctic).
        EPSG = 3413 if zone == "NP" else 3031
        crs = CRS.from_epsg(EPSG)
        # NOTE(review): these origins span a 9,000 km × 4,500 km window only
        # (y >= 0 for NP, y <= 0 for SP) — confirm the half-plane is intended
        # and not a truncation of the polar cap.
        bounds = (-4_500_000, 0) if zone == "NP" else (-4_500_000, -4_500_000)
        origin_x, origin_y = bounds
        xmax, ymax = origin_x + 9_000_000, origin_y + 4_500_000
        # Overlapping tiles advance by a reduced step; otherwise tiles abut.
        step = int(grid_size * (1 - self.overlap_ratio)) if self.overlap_ratio > 0 else grid_size
        cols = np.arange(origin_x, xmax, step)
        rows = np.arange(origin_y, ymax, step)

        # Keep only tiles whose centroid latitude lies inside the polar cap
        # (>= 84°N for NP, <= 80°S for SP), matching the UTM system's limits.
        transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
        grid_x, grid_y = np.meshgrid(cols, rows)
        cx = grid_x.ravel() + grid_size / 2
        cy = grid_y.ravel() + grid_size / 2
        _, lats = transformer.transform(cx, cy)
        mask = (lats >= 84) if zone == "NP" else (lats <= -80)
        # Convert surviving lower-left corners to integer grid indices.
        x_idx = ((grid_x.ravel()[mask] - origin_x) // grid_size).astype(int)
        y_idx = ((grid_y.ravel()[mask] - origin_y) // grid_size).astype(int)
        return x_idx, y_idx

    # ─── UTM ──────────────────────────────────────────────────
    zone_num = int(zone[:-1]); hemi = zone[-1]
    # WGS84 / UTM EPSG codes: 326xx for the northern, 327xx for the southern hemisphere.
    epsg = 32600 + zone_num if hemi == "N" else 32700 + zone_num
    crs = CRS.from_epsg(epsg)

    origin_x = 100_000
    origin_y = 0 if hemi == "N" else 10_000_000
    xmin, xmax = origin_x, 900_000
    ymin, ymax = (0, 9_329_005) if hemi == "N" else (0, origin_y)

    step = int(grid_size * (1 - self.overlap_ratio)) if self.overlap_ratio > 0 else grid_size
    cols = np.arange(xmin, xmax, step)
    rows = np.arange(ymin, ymax, step)

    # Fast centroid-based lon–lat filter (reuse existing logic);
    # zone n covers longitudes [(n-1)*6-180, n*6-180] degrees.
    valid_x, valid_y = self._prefilter_grid_centroids(
        cols, rows, grid_size, crs,
        ((zone_num - 1) * 6 - 180, zone_num * 6 - 180)
    )
    x_idx = ((valid_x - origin_x) // grid_size).astype(int)
    y_idx = ((valid_y - origin_y) // grid_size).astype(int)
    # NOTE(review): for "S" zones origin_y = 10,000,000 while rows start at 0,
    # so y_idx is negative there — verify downstream id formatting expects that.
    keep = self._mask_by_ref(grid_size, zone, x_idx, y_idx)
    return x_idx[keep], y_idx[keep]
770+
771+
def build_tile_index_parquet(
    self,
    output_path: str = "grid_index.parquet",
    row_group_target: int | None = None,
) -> None:
    """
    Create a single Parquet file containing *all* tile_id / super_id pairs
    for every configured level and every UTM + optional polar zone—without
    generating geometries.

    Parameters
    ----------
    output_path : str, default "grid_index.parquet"
        Destination path.
    row_group_target : int | None
        Desired row-group size. If None, it is set to
        max(total_rows // 1024, 1024).

    Raises
    ------
    RuntimeError
        If the current configuration yields no tiles at all.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    # ------------------------------------------------------------------ zones
    # All 120 UTM zones, plus the two polar zones when enabled.
    zones = [f"{i}{h}" for i in range(1, 61) for h in "NS"]
    if getattr(self, "include_polar", False):
        zones += ["NP", "SP"]

    # ---------------------------------------------------------------- pass 1
    # Count rows only. Indices are deliberately recomputed in pass 2 rather
    # than cached, so peak memory stays bounded for the finest levels.
    total_rows = 0
    for level in self.levels:
        print("Processing Level: ", level)
        for zone in zones:
            x_idx, _ = self._iter_valid_xy(level, zone)
            total_rows += x_idx.size

    if total_rows == 0:
        raise RuntimeError("No tiles found with current configuration.")

    if row_group_target is None:
        # Divisor is 1024 (the earlier docstring incorrectly said 512).
        row_group_target = max(total_rows // 1024, 1024)

    # -------------------------------------------------------------- writer
    schema = pa.schema(
        [("tile_id", pa.string()), ("super_id", pa.string())]
    )
    writer = pq.ParquetWriter(
        output_path, schema, version="2.6", compression="snappy"
    )

    # --------------------------------------------------------------- pass 2
    buffer_tile, buffer_super = [], []

    def _flush(count: int) -> None:
        """Write the first *count* buffered rows as one row group."""
        nonlocal buffer_tile, buffer_super
        if count:
            table = pa.table(
                {
                    "tile_id": pa.array(buffer_tile[:count]),
                    "super_id": pa.array(buffer_super[:count]),
                }
            )
            writer.write_table(table, row_group_size=row_group_target)
            buffer_tile = buffer_tile[count:]
            buffer_super = buffer_super[count:]

    try:
        for level in self.levels:
            for zone in zones:
                x_idx, y_idx = self._iter_valid_xy(level, zone)
                if x_idx.size == 0:
                    continue

                buffer_tile.extend(
                    self._make_tile_id(level, zone, xi, yi)
                    for xi, yi in zip(x_idx, y_idx)
                )
                buffer_super.extend(
                    self._compute_super_id(level, zone, xi, yi)
                    for xi, yi in zip(x_idx, y_idx)
                )

                # Emit full row groups as soon as the buffer can fill one.
                while len(buffer_tile) >= row_group_target:
                    _flush(row_group_target)

        _flush(len(buffer_tile))  # write any remainder
    finally:
        # Always close so the Parquet footer is written even if pass 2 raises;
        # otherwise the output file is unreadable.
        writer.close()

    print(f"[OK] grid index written → {output_path} "
          f"({total_rows:,} rows, row_group {row_group_target})")
866+
867+
868+
752869
def check_satellite_resolution_compatibility(self, grid_sizes: List[int], satellite_resolutions: List[int]) -> pd.DataFrame:
753870
"""
754871
Computes how well each tile level intersects with a set of satellite resolutions.

0 commit comments

Comments
 (0)