Source code for h2ss.data

  1"""Functions to download and read data."""
  2
  3import glob
  4import os
  5from datetime import datetime, timezone
  6from zipfile import ZipFile
  7
  8import geopandas as gpd
  9import pandas as pd
 10import pooch
 11import rasterio as rio
 12import shapely
 13import xarray as xr
 14from geocube.api.core import make_geocube
 15
 16# CRS of the Kish Basin dat files
 17CRS = 23029
 18
 19
def download_data(url, data_dir, file_name, known_hash=None):
    """Download data and store it in the specified directory using Pooch.

    Parameters
    ----------
    url : str
        URL from which the data will be downloaded
    data_dir : str
        Directory to store the downloaded data
    file_name : str
        Name of the downloaded data file with its extension (not full path)
    known_hash : str, optional
        SHA256 hash of the downloaded file; if ``None``, the hash is not
        verified by Pooch

    Notes
    -----
    This only downloads data if necessary, i.e. if the data file does not
    already exist in the directory. A sidecar ``<file_name>.txt`` file is
    written alongside the download recording the timestamp, URL, and SHA256
    hash; when the data already exists, that sidecar file is printed instead.
    """
    data_file = os.path.join(data_dir, file_name)
    if not os.path.isfile(data_file):
        os.makedirs(data_dir, exist_ok=True)
        pooch.retrieve(
            url=url, known_hash=known_hash, fname=file_name, path=data_dir
        )
        # capture a single timestamp so the console message and the sidecar
        # file agree (previously datetime.now() was called twice, so the two
        # recorded times could differ)
        downloaded_on = datetime.now(tz=timezone.utc)
        print(f"Data downloaded on: {downloaded_on}")
        with open(f"{data_file}.txt", "w", encoding="utf-8") as outfile:
            outfile.write(
                f"Data downloaded on: {downloaded_on}\n"
                f"Download URL: {url}\n"
                f"SHA256 hash: {pooch.file_hash(data_file)}\n"
            )
    else:
        print(f"Data '{file_name}' already exists in '{data_dir}'.")
        with open(f"{data_file}.txt", encoding="utf-8") as f:
            print(f.read())
def kish_basin_extent(dat_path):
    """Read the extent of the Kish Basin data.

    Parameters
    ----------
    dat_path : str
        Path to the Kish Basin data files

    Returns
    -------
    geopandas.GeoSeries
        GeoPandas geoseries of the extent polygon
    """
    # the first two rows of the CSV are metadata, hence skiprows=2
    corners = pd.read_csv(
        os.path.join(dat_path, "Kish GIS Map Extent - Square.csv"), skiprows=2
    )
    # NOTE: the CSV's column headers carry a leading space (" X", " Y");
    # build the polygon from all corner rows rather than hard-coding the
    # first four, which generalizes to extent files with more vertices
    dat_extent = gpd.GeoSeries(
        shapely.geometry.Polygon(
            [(x, y) for x, y in zip(corners[" X"], corners[" Y"])]
        ),
        crs=CRS,
    )
    return dat_extent
def read_dat_file(dat_path):
    """Read XYZ halite data layers into an Xarray dataset.

    Parameters
    ----------
    dat_path : str
        Path to the DAT (ASCII XYZ) files

    Returns
    -------
    tuple[xarray.Dataset, geopandas.GeoSeries]
        Xarray dataset of the XYZ data and GeoPandas geoseries of the extent
    """
    gdf = {}
    # for dat_file in glob.glob(os.path.join(dat_path, "*.dat")):
    # skip files whose names contain "Zone"; only layer files are read here
    for dat_file in [
        x
        for x in glob.glob(os.path.join(dat_path, "*.dat"))
        if "Zone" not in x
    ]:
        # read each layer as individual dataframes into a dictionary,
        # keyed by the file name without its ".dat" extension
        gdf[os.path.split(dat_file)[-1][:-4]] = pd.read_fwf(
            dat_file, header=None, names=["X", "Y", "Z"]
        )

        # assign layer name to a column
        gdf[os.path.split(dat_file)[-1][:-4]]["data"] = os.path.split(
            dat_file
        )[-1][:-4]

    # # find data resolution
    # gdf_xr = gdf[list(gdf.keys())[0]].set_index(["X", "Y"]).to_xarray()
    # resx = gdf_xr["X"][1] - gdf_xr["X"][0]
    # resy = gdf_xr["Y"][1] - gdf_xr["Y"][0]

    # combine dataframes
    gdf = pd.concat(gdf.values())

    # convert dataframe to geodataframe
    gdf["geometry"] = gpd.points_from_xy(gdf.X, gdf.Y, crs=CRS)
    gdf = gpd.GeoDataFrame(gdf)
    gdf.drop(columns=["X", "Y"], inplace=True)

    # convert to Xarray dataset: rasterise the points onto a fixed 200-unit
    # grid, producing one 2D "Z" layer per distinct "data" value
    xds = make_geocube(
        vector_data=gdf,
        # resolution=(-abs(resy), abs(resx)),
        # align=(abs(resy / 2), abs(resx / 2)),
        resolution=(-200, 200),
        align=(100, 100),
        group_by="data",
    )

    # split variables and halite members: the layer name's first word is the
    # halite member, and the text between "Halite " and " XYZ" is the
    # variable name (assumes names like "<Member> Halite <Variable> XYZ ...")
    xds_ = {}
    for d in xds["data"].values:
        halite_member = d.split(" ")[0]
        # fix halite names (known typos in the source file names)
        if halite_member == "Presall":
            halite_member = "Preesall"
        elif halite_member == "Flyde":
            halite_member = "Fylde"
        # unit = d.split(" ")[-1]
        zvar = d.split("Halite ")[-1].split(" XYZ")[0]
        # promote the halite member to a dimension coordinate so layers from
        # different members can be concatenated along "halite"
        xds_[d] = (
            xds.sel(data=d)
            .assign_coords(halite=halite_member)
            .expand_dims(dim="halite")
            .drop_vars("data")
        )
        # rename "Z" to the variable name with spaces removed, e.g. "TopDepth"
        xds_[d] = xds_[d].rename({"Z": zvar.replace(" ", "")})
        # xds_[d][zvar.replace(" ", "")] = xds_[d][
        #     zvar.replace(" ", "")
        # ].assign_attrs(units=unit, long_name=zvar)

    # merge all per-layer datasets into one dataset on the shared grid
    xds = xr.combine_by_coords(xds_.values(), combine_attrs="override")

    # # keep only points corresponding to zones of interest in the dataframe
    # zones = gdf.loc[gdf["data"].str.contains("Zone")]

    # # create zones of interest polygon
    # zones = gpd.GeoDataFrame(geometry=zones.buffer(100).envelope).dissolve()

    # create extent polygon
    dat_extent = kish_basin_extent(dat_path=dat_path)

    return xds, dat_extent  # zones
def kish_basin_data_depth_adjusted(dat_path, bathymetry_path):
    """Read halite data with adjusted depths and its extent.

    Parameters
    ----------
    dat_path : str
        Path to the DAT (ASCII XYZ) files
    bathymetry_path : str
        Path to the bathymetry netCDF file

    Returns
    -------
    tuple[xarray.Dataset, geopandas.GeoSeries]
        Xarray dataset of the halite data and GeoPandas geoseries of the
        extent

    Notes
    -----
    A hacky workaround was used to prevent multiple grid mappings in the
    resulting Xarray dataset by first assigning a variable of the desired
    mapping and then multiplying it by zero prior to involving variables
    which originally had a different mapping.
    """
    # open the bathymetry lazily and chunk it for out-of-core processing
    bathymetry = xr.open_dataset(
        os.path.join(bathymetry_path, "D4_2022.nc"), decode_coords="all"
    ).chunk({"lat": 1000, "lon": 1000, "cdi_index_count": 1000})

    xds, dat_extent = read_dat_file(dat_path=dat_path)

    # resample the bathymetry onto the halite dataset's grid
    bathymetry = bathymetry.rename(
        {"lon": "x", "lat": "y"}
    ).rio.reproject_match(xds, resampling=rio.enums.Resampling.bilinear)
    elevation = bathymetry["elevation"]

    # depths adjusted to be relative to the sea floor
    xds = xds.assign(
        TopDepthSeabed=xds["TopDepth"] + elevation,
        BaseDepthSeabed=xds["BaseDepth"] + elevation,
    )
    # hacky way to prevent multiple grid mappings
    xds = xds.assign(Bathymetry=xds["TopDepth"] * 0 + elevation)
    return xds, dat_extent
def bathymetry_layer(dat_extent, bathymetry_path):
    """Bathymetry layer for plotting.

    Parameters
    ----------
    dat_extent : geopandas.GeoSeries
        Extent of the data
    bathymetry_path : str
        Path to the bathymetry netCDF file

    Returns
    -------
    xarray.Dataset
        Xarray dataset of the halite data
    """
    nc_file = os.path.join(bathymetry_path, "D4_2022.nc")
    # open lazily, chunked for out-of-core processing
    bath = xr.open_dataset(nc_file, decode_coords="all").chunk(
        {"lat": 1000, "lon": 1000, "cdi_index_count": 1000}
    )
    # reproject to the data CRS and clip to the data extent
    return bath.rio.reproject(CRS).rio.clip(dat_extent.to_crs(CRS))
def read_shapefile_from_zip(data_path, endswith=".shp"):
    """Read the Shapefile layer from a Zip file.

    Parameters
    ----------
    data_path : str
        Path to the exploration well Zip file
    endswith : str
        What the Shapefile's filename ends with

    Returns
    -------
    geopandas.GeoDataFrame
        Geodataframe of the Shapefile's data

    Notes
    -----
    Raises ``IndexError`` if no archive member matches ``endswith``.
    """
    # close the archive promptly; the original leaked the ZipFile handle
    with ZipFile(data_path) as zf:
        # first archive member matching the requested extension
        shp_member = [x for x in zf.namelist() if x.endswith(endswith)][0]
    # GDAL virtual file system syntax: zip://<archive>!<member>
    # (the original wrapped this in a no-op single-argument os.path.join)
    data_shp = gpd.read_file(f"zip://{data_path}!{shp_member}")
    return data_shp
def halite_shape(dat_xr, halite=None):
    """Create a vector shape of the halite data.

    Parameters
    ----------
    dat_xr : xarray.Dataset
        Xarray dataset of the halite data
    halite : str
        Halite member

    Returns
    -------
    geopandas.GeoDataFrame
        A (multi)polygon geodataframe of the halite's shape
    """
    # pick the thickness layer: a single member if given, otherwise the
    # maximum across all members
    if halite:
        thickness = dat_xr.sel(halite=halite)["Thickness"]
    else:
        thickness = dat_xr.max(dim="halite")["Thickness"]
    cells = thickness.to_dataframe().dropna().reset_index()
    # turn each grid cell centre into a square (buffer + envelope) and
    # dissolve into one (multi)polygon
    points = gpd.GeoSeries(gpd.points_from_xy(cells.x, cells.y))
    squares = points.buffer(100).envelope
    return gpd.GeoDataFrame(geometry=squares, crs=CRS).dissolve()