Source code for h2ss.data

  1"""Functions to download and read data.
  2
  3"""
  4
  5import glob
  6import os
  7from datetime import datetime, timezone
  8from zipfile import ZipFile
  9
 10import geopandas as gpd
 11import pandas as pd
 12import pooch
 13import rasterio as rio
 14import shapely
 15import xarray as xr
 16from geocube.api.core import make_geocube
 17
 18# CRS of the Kish Basin dat files
 19CRS = 23029
 20
 21
def download_data(url, data_dir, file_name, known_hash=None):
    """Download data and store it in the specified directory using Pooch.

    Parameters
    ----------
    url : str
        URL from which the data will be downloaded
    data_dir : str
        Directory to store the downloaded data
    file_name : str
        Name of the downloaded data file with its extension (not full path)
    known_hash : str, optional
        SHA256 hash of the downloaded file

    Notes
    -----
    Data is downloaded only if necessary, i.e. if the data file does not
    already exist in the directory.
    """
    data_file = os.path.join(data_dir, file_name)
    if not os.path.isfile(data_file):
        os.makedirs(data_dir, exist_ok=True)
        pooch.retrieve(
            url=url, known_hash=known_hash, fname=file_name, path=data_dir
        )
        print(f"Data downloaded on: {datetime.now(tz=timezone.utc)}")
        with open(f"{data_file}.txt", "w", encoding="utf-8") as outfile:
            outfile.write(
                f"Data downloaded on: {datetime.now(tz=timezone.utc)}\n"
                f"Download URL: {url}\n"
                f"SHA256 hash: {pooch.file_hash(data_file)}\n"
            )
    else:
        print(f"Data '{file_name}' already exists in '{data_dir}'.")
        with open(f"{data_file}.txt", encoding="utf-8") as f:
            print(f.read())
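

# Usage sketch (not part of the original module): download a dataset archive
# into a local directory. The URL, directory, and file name below are
# hypothetical placeholders.
#
#     download_data(
#         url="https://example.com/Kish-Basin-dat-files.zip",
#         data_dir=os.path.join("data", "kish-basin"),
#         file_name="Kish-Basin-dat-files.zip",
#     )
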
def kish_basin_extent(dat_path):
    """Read the extent of the Kish Basin data.

    Parameters
    ----------
    dat_path : str
        Path to the Kish Basin data files

    Returns
    -------
    geopandas.GeoSeries
        GeoPandas geoseries of the extent polygon
    """
    dat_extent = pd.read_csv(
        os.path.join(dat_path, "Kish GIS Map Extent - Square.csv"), skiprows=2
    )
    dat_extent = gpd.GeoSeries(
        shapely.geometry.Polygon(
            [
                (dat_extent[" X"][0], dat_extent[" Y"][0]),
                (dat_extent[" X"][1], dat_extent[" Y"][1]),
                (dat_extent[" X"][2], dat_extent[" Y"][2]),
                (dat_extent[" X"][3], dat_extent[" Y"][3]),
            ]
        ),
        crs=CRS,
    )
    return dat_extent
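

# Usage sketch (assumed local path): read the extent polygon; the directory
# shown is a placeholder, and the reprojection to WGS84 is optional.
#
#     extent = kish_basin_extent(dat_path=os.path.join("data", "kish-basin"))
#     extent_wgs84 = extent.to_crs(4326)
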
def read_dat_file(dat_path):
    """Read XYZ halite data layers into an Xarray dataset.

    Parameters
    ----------
    dat_path : str
        Path to the DAT (ASCII XYZ) files

    Returns
    -------
    tuple[xarray.Dataset, geopandas.GeoSeries]
        Xarray dataset of the XYZ data and GeoPandas geoseries of the extent
    """
    gdf = {}
    # for dat_file in glob.glob(os.path.join(dat_path, "*.dat")):
    for dat_file in [
        x
        for x in glob.glob(os.path.join(dat_path, "*.dat"))
        if "Zone" not in x
    ]:
        # read each layer as an individual dataframe into a dictionary
        gdf[os.path.split(dat_file)[-1][:-4]] = pd.read_fwf(
            dat_file, header=None, names=["X", "Y", "Z"]
        )

        # assign layer name to a column
        gdf[os.path.split(dat_file)[-1][:-4]]["data"] = os.path.split(
            dat_file
        )[-1][:-4]

    # # find data resolution
    # gdf_xr = gdf[list(gdf.keys())[0]].set_index(["X", "Y"]).to_xarray()
    # resx = gdf_xr["X"][1] - gdf_xr["X"][0]
    # resy = gdf_xr["Y"][1] - gdf_xr["Y"][0]

    # combine dataframes
    gdf = pd.concat(gdf.values())

    # convert dataframe to geodataframe
    gdf["geometry"] = gpd.points_from_xy(gdf.X, gdf.Y, crs=CRS)
    gdf = gpd.GeoDataFrame(gdf)
    gdf.drop(columns=["X", "Y"], inplace=True)

    # convert to Xarray dataset
    xds = make_geocube(
        vector_data=gdf,
        # resolution=(-abs(resy), abs(resx)),
        # align=(abs(resy / 2), abs(resx / 2)),
        resolution=(-200, 200),
        align=(100, 100),
        group_by="data",
    )

    # split variables and halite members
    xds_ = {}
    for d in xds["data"].values:
        halite_member = d.split(" ")[0]
        # fix halite names
        if halite_member == "Presall":
            halite_member = "Preesall"
        elif halite_member == "Flyde":
            halite_member = "Fylde"
        # unit = d.split(" ")[-1]
        zvar = d.split("Halite ")[-1].split(" XYZ")[0]
        xds_[d] = (
            xds.sel(data=d)
            .assign_coords(halite=halite_member)
            .expand_dims(dim="halite")
            .drop_vars("data")
        )
        xds_[d] = xds_[d].rename({"Z": zvar.replace(" ", "")})
        # xds_[d][zvar.replace(" ", "")] = xds_[d][
        #     zvar.replace(" ", "")
        # ].assign_attrs(units=unit, long_name=zvar)

    xds = xr.combine_by_coords(xds_.values(), combine_attrs="override")

    # # keep only points corresponding to zones of interest in the dataframe
    # zones = gdf.loc[gdf["data"].str.contains("Zone")]

    # # create zones of interest polygon
    # zones = gpd.GeoDataFrame(geometry=zones.buffer(100).envelope).dissolve()

    # create extent polygon
    dat_extent = kish_basin_extent(dat_path=dat_path)

    return xds, dat_extent  # zones
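

# Usage sketch (assumed directory layout): read the halite layers and their
# extent, then inspect one variable for one member. The path is a placeholder;
# "Thickness" and "Preesall" are names that appear elsewhere in this module.
#
#     ds, extent = read_dat_file(dat_path=os.path.join("data", "kish-basin"))
#     print(ds["Thickness"].sel(halite="Preesall"))
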
def kish_basin_data_depth_adjusted(dat_path, bathymetry_path):
    """Read halite data with depths adjusted relative to the seabed.

    Parameters
    ----------
    dat_path : str
        Path to the DAT (ASCII XYZ) files
    bathymetry_path : str
        Path to the bathymetry netCDF file

    Returns
    -------
    tuple[xarray.Dataset, geopandas.GeoSeries]
        Xarray dataset of the halite data and GeoPandas geoseries of the extent

    Notes
    -----
    A workaround is used to prevent multiple grid mappings in the resulting
    Xarray dataset: a variable with the desired grid mapping is assigned first
    and multiplied by zero before variables that originally had a different
    grid mapping are involved.
    """
    bath = xr.open_dataset(
        os.path.join(bathymetry_path, "D4_2022.nc"), decode_coords="all"
    )
    bath = bath.chunk({"lat": 1000, "lon": 1000, "cdi_index_count": 1000})
    xds, dat_extent = read_dat_file(dat_path=dat_path)
    bath = bath.rename({"lon": "x", "lat": "y"}).rio.reproject_match(
        xds, resampling=rio.enums.Resampling.bilinear
    )
    xds = xds.assign(TopDepthSeabed=xds["TopDepth"] + bath["elevation"])
    xds = xds.assign(BaseDepthSeabed=xds["BaseDepth"] + bath["elevation"])
    # hacky way to prevent multiple grid mappings
    xds = xds.assign(Bathymetry=xds["TopDepth"] * 0 + bath["elevation"])
    # for v in ["TopDepth", "BaseDepth"]:
    #     xds[f"{v}Seabed"].attrs = xds[v].attrs
    #     xds[f"{v}Seabed"].attrs["long_name"] = (
    #         xds[f"{v}Seabed"].attrs["long_name"] + " relative to the "
    #         "sea-floor"
    #     )
    # xds["Bathymetry"].attrs = bath["elevation"].attrs
    return xds, dat_extent
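

# Usage sketch (placeholder paths): combine the halite layers with the
# D4_2022.nc bathymetry to obtain depths relative to the seabed.
#
#     ds, extent = kish_basin_data_depth_adjusted(
#         dat_path=os.path.join("data", "kish-basin"),
#         bathymetry_path=os.path.join("data", "bathymetry"),
#     )
#     print(ds[["TopDepthSeabed", "BaseDepthSeabed"]])
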
def bathymetry_layer(dat_extent, bathymetry_path):
    """Create a bathymetry layer for plotting.

    Parameters
    ----------
    dat_extent : geopandas.GeoSeries
        Extent of the data
    bathymetry_path : str
        Path to the bathymetry netCDF file

    Returns
    -------
    xarray.Dataset
        Xarray dataset of the bathymetry data clipped to the extent
    """
    bath = xr.open_dataset(
        os.path.join(bathymetry_path, "D4_2022.nc"), decode_coords="all"
    )
    bath = bath.chunk({"lat": 1000, "lon": 1000, "cdi_index_count": 1000})
    bath = bath.rio.reproject(CRS).rio.clip(dat_extent.to_crs(CRS))
    return bath
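

# Usage sketch (placeholder paths): clip the bathymetry to the Kish Basin
# extent for use as a background layer in plots.
#
#     _, extent = read_dat_file(dat_path=os.path.join("data", "kish-basin"))
#     bath = bathymetry_layer(
#         dat_extent=extent,
#         bathymetry_path=os.path.join("data", "bathymetry"),
#     )
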
def read_shapefile_from_zip(data_path, endswith=".shp"):
    """Read the Shapefile layer from a Zip file.

    Parameters
    ----------
    data_path : str
        Path to the Zip file containing the Shapefile layer
    endswith : str
        What the Shapefile's filename ends with

    Returns
    -------
    geopandas.GeoDataFrame
        Geodataframe of the Shapefile's data
    """
    data_shp = gpd.read_file(
        os.path.join(
            f"zip://{data_path}!"
            + [
                x
                for x in ZipFile(data_path).namelist()
                if x.endswith(endswith)
            ][0]
        )
    )
    return data_shp
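

# Usage sketch (the archive name is a placeholder): read the first Shapefile
# layer found inside a Zip archive, e.g. exploration well locations.
#
#     wells = read_shapefile_from_zip(
#         data_path=os.path.join("data", "exploration-wells.zip")
#     )
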
def halite_shape(dat_xr, halite=None):
    """Create a vector shape of the halite data.

    Parameters
    ----------
    dat_xr : xarray.Dataset
        Xarray dataset of the halite data
    halite : str, optional
        Halite member; if None, the shape covers all halite members combined

    Returns
    -------
    geopandas.GeoDataFrame
        A (multi)polygon geodataframe of the halite's shape
    """
    if halite:
        shape = (
            dat_xr.sel(halite=halite)["Thickness"]
            .to_dataframe()
            .dropna()
            .reset_index()
        )
    else:
        shape = (
            dat_xr.max(dim="halite")["Thickness"]
            .to_dataframe()
            .dropna()
            .reset_index()
        )
    shape = gpd.GeoDataFrame(
        geometry=gpd.GeoSeries(gpd.points_from_xy(shape.x, shape.y))
        .buffer(100)
        .envelope,
        crs=CRS,
    ).dissolve()
    return shape
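

# Usage sketch (placeholder path): derive the footprint of a single halite
# member and the combined footprint of all members.
#
#     ds, _ = read_dat_file(dat_path=os.path.join("data", "kish-basin"))
#     preesall_shape = halite_shape(dat_xr=ds, halite="Preesall")
#     full_shape = halite_shape(dat_xr=ds)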