1"""Functions to download and read data.
2
3"""
4
5import glob
6import os
7from datetime import datetime, timezone
8from zipfile import ZipFile
9
10import geopandas as gpd
11import pandas as pd
12import pooch
13import rasterio as rio
14import shapely
15import xarray as xr
16from geocube.api.core import make_geocube
17
18# CRS of the Kish Basin dat files
19CRS = 23029
20
21
[docs]
22def download_data(url, data_dir, file_name, known_hash=None):
23 """Download data and store it in the specified directory using Pooch.
24
25 Parameters
26 ----------
27 url : str
28 URL from which the data will be downloaded
29 data_dir : str
30 Directory to store the downloaded data
31 file_name : str
32 Name of the downloaded data file with its extension (not full path)
33 known_hash : str
34 SHA256 hash of downloaded file
35
36 Notes
37 -----
38 This only downloads data if necessary, i.e. if the data file does not
39 already exist in the directory.
40 """
41 data_file = os.path.join(data_dir, file_name)
42 if not os.path.isfile(data_file):
43 os.makedirs(data_dir, exist_ok=True)
44 pooch.retrieve(
45 url=url, known_hash=known_hash, fname=file_name, path=data_dir
46 )
47 print(f"Data downloaded on: {datetime.now(tz=timezone.utc)}")
48 with open(f"{data_file}.txt", "w", encoding="utf-8") as outfile:
49 outfile.write(
50 f"Data downloaded on: {datetime.now(tz=timezone.utc)}\n"
51 f"Download URL: {url}\n"
52 f"SHA256 hash: {pooch.file_hash(data_file)}\n"
53 )
54 else:
55 print(f"Data '{file_name}' already exists in '{data_dir}'.")
56 with open(f"{data_file}.txt", encoding="utf-8") as f:
57 print(f.read())
58
59
[docs]
60def kish_basin_extent(dat_path):
61 """Read the extent of the Kish Basin data.
62
63 Parameters
64 ----------
65 dat_path : str
66 Path to the Kish Basin data files
67
68 Returns
69 -------
70 geopandas.GeoSeries
71 GeoPandas geoseries of the extent polygon
72 """
73 dat_extent = pd.read_csv(
74 os.path.join(dat_path, "Kish GIS Map Extent - Square.csv"), skiprows=2
75 )
76 dat_extent = gpd.GeoSeries(
77 shapely.geometry.Polygon(
78 [
79 (dat_extent[" X"][0], dat_extent[" Y"][0]),
80 (dat_extent[" X"][1], dat_extent[" Y"][1]),
81 (dat_extent[" X"][2], dat_extent[" Y"][2]),
82 (dat_extent[" X"][3], dat_extent[" Y"][3]),
83 ]
84 ),
85 crs=CRS,
86 )
87 return dat_extent
88
89
[docs]
90def read_dat_file(dat_path):
91 """Read XYZ halite data layers into an Xarray dataset.
92
93 Parameters
94 ----------
95 dat_path : str
96 Path to the DAT (ASCII XYZ) files
97
98 Returns
99 -------
100 tuple[xarray.Dataset, geopandas.GeoSeries]
101 Xarray dataset of the XYZ data and GeoPandas geoseries of the extent
102 """
103 gdf = {}
104 # for dat_file in glob.glob(os.path.join(dat_path, "*.dat")):
105 for dat_file in [
106 x
107 for x in glob.glob(os.path.join(dat_path, "*.dat"))
108 if "Zone" not in x
109 ]:
110 # read each layer as individual dataframes into a dictionary
111 gdf[os.path.split(dat_file)[-1][:-4]] = pd.read_fwf(
112 dat_file, header=None, names=["X", "Y", "Z"]
113 )
114
115 # assign layer name to a column
116 gdf[os.path.split(dat_file)[-1][:-4]]["data"] = os.path.split(
117 dat_file
118 )[-1][:-4]
119
120 # # find data resolution
121 # gdf_xr = gdf[list(gdf.keys())[0]].set_index(["X", "Y"]).to_xarray()
122 # resx = gdf_xr["X"][1] - gdf_xr["X"][0]
123 # resy = gdf_xr["Y"][1] - gdf_xr["Y"][0]
124
125 # combine dataframes
126 gdf = pd.concat(gdf.values())
127
128 # convert dataframe to geodataframe
129 gdf["geometry"] = gpd.points_from_xy(gdf.X, gdf.Y, crs=CRS)
130 gdf = gpd.GeoDataFrame(gdf)
131 gdf.drop(columns=["X", "Y"], inplace=True)
132
133 # convert to Xarray dataset
134 xds = make_geocube(
135 vector_data=gdf,
136 # resolution=(-abs(resy), abs(resx)),
137 # align=(abs(resy / 2), abs(resx / 2)),
138 resolution=(-200, 200),
139 align=(100, 100),
140 group_by="data",
141 )
142
143 # split variables and halite members
144 xds_ = {}
145 for d in xds["data"].values:
146 halite_member = d.split(" ")[0]
147 # fix halite names
148 if halite_member == "Presall":
149 halite_member = "Preesall"
150 elif halite_member == "Flyde":
151 halite_member = "Fylde"
152 # unit = d.split(" ")[-1]
153 zvar = d.split("Halite ")[-1].split(" XYZ")[0]
154 xds_[d] = (
155 xds.sel(data=d)
156 .assign_coords(halite=halite_member)
157 .expand_dims(dim="halite")
158 .drop_vars("data")
159 )
160 xds_[d] = xds_[d].rename({"Z": zvar.replace(" ", "")})
161 # xds_[d][zvar.replace(" ", "")] = xds_[d][
162 # zvar.replace(" ", "")
163 # ].assign_attrs(units=unit, long_name=zvar)
164
165 xds = xr.combine_by_coords(xds_.values(), combine_attrs="override")
166
167 # # keep only points corresponding to zones of interest in the dataframe
168 # zones = gdf.loc[gdf["data"].str.contains("Zone")]
169
170 # # create zones of interest polygon
171 # zones = gpd.GeoDataFrame(geometry=zones.buffer(100).envelope).dissolve()
172
173 # create extent polygon
174 dat_extent = kish_basin_extent(dat_path=dat_path)
175
176 return xds, dat_extent # zones
177
178
[docs]
179def kish_basin_data_depth_adjusted(dat_path, bathymetry_path):
180 """Read halite data with adjusted depths and its extent.
181
182 Parameters
183 ----------
184 dat_path : str
185 Path to the DAT (ASCII XYZ) files
186 bathymetry_path : str
187 Path to the bathymetry netCDF file
188
189 Returns
190 -------
191 tuple[xarray.Dataset, geopandas.GeoSeries]
192 Xarray dataset of the halite data and GeoPandas geoseries of the extent
193
194 Notes
195 -----
196 A hacky workaround was used to prevent multiple grid mappings in the
197 resulting Xarray dataset by first assigning a variable of the desired
198 mapping and then multiplying it by zero prior to involving variables which
199 originally had a different mapping.
200 """
201 bath = xr.open_dataset(
202 os.path.join(bathymetry_path, "D4_2022.nc"), decode_coords="all"
203 )
204 bath = bath.chunk({"lat": 1000, "lon": 1000, "cdi_index_count": 1000})
205 xds, dat_extent = read_dat_file(dat_path=dat_path)
206 bath = bath.rename({"lon": "x", "lat": "y"}).rio.reproject_match(
207 xds, resampling=rio.enums.Resampling.bilinear
208 )
209 xds = xds.assign(TopDepthSeabed=xds["TopDepth"] + bath["elevation"])
210 xds = xds.assign(BaseDepthSeabed=xds["BaseDepth"] + bath["elevation"])
211 # hacky way to prevent multiple grid mappings
212 xds = xds.assign(Bathymetry=xds["TopDepth"] * 0 + bath["elevation"])
213 # for v in ["TopDepth", "BaseDepth"]:
214 # xds[f"{v}Seabed"].attrs = xds[v].attrs
215 # xds[f"{v}Seabed"].attrs["long_name"] = (
216 # xds[f"{v}Seabed"].attrs["long_name"] + " relative to the "
217 # "sea-floor"
218 # )
219 # xds["Bathymetry"].attrs = bath["elevation"].attrs
220 return xds, dat_extent
221
222
[docs]
223def bathymetry_layer(dat_extent, bathymetry_path):
224 """Bathymetry layer for plotting.
225
226 Parameters
227 ----------
228 dat_extent : geopandas.GeoSeries
229 Extent of the data
230 bathymetry_path : str
231 Path to the bathymetry netCDF file
232
233 Returns
234 -------
235 xarray.Dataset
236 Xarray dataset of the halite data
237 """
238 bath = xr.open_dataset(
239 os.path.join(bathymetry_path, "D4_2022.nc"), decode_coords="all"
240 )
241 bath = bath.chunk({"lat": 1000, "lon": 1000, "cdi_index_count": 1000})
242 bath = bath.rio.reproject(CRS).rio.clip(dat_extent.to_crs(CRS))
243 return bath
244
245
[docs]
246def read_shapefile_from_zip(data_path, endswith=".shp"):
247 """Read the Shapefile layer from a Zip file.
248
249 Parameters
250 ----------
251 data_path : str
252 Path to the exploration well Zip file
253 endswith : str
254 What the Shapefile's filename ends with
255
256 Returns
257 -------
258 geopandas.GeoDataFrame
259 Geodataframe of the Shapefile's data
260 """
261 data_shp = gpd.read_file(
262 os.path.join(
263 f"zip://{data_path}!"
264 + [
265 x
266 for x in ZipFile(data_path).namelist()
267 if x.endswith(endswith)
268 ][0]
269 )
270 )
271 return data_shp
272
273
[docs]
274def halite_shape(dat_xr, halite=None):
275 """Create a vector shape of the halite data.
276
277 Parameters
278 ----------
279 dat_xr : xarray.Dataset
280 Xarray dataset of the halite data
281 halite : str
282 Halite member
283
284 Returns
285 -------
286 geopandas.GeoDataFrame
287 A (multi)polygon geodataframe of the halite's shape
288 """
289 if halite:
290 shape = (
291 dat_xr.sel(halite=halite)["Thickness"]
292 .to_dataframe()
293 .dropna()
294 .reset_index()
295 )
296 else:
297 shape = (
298 dat_xr.max(dim="halite")["Thickness"]
299 .to_dataframe()
300 .dropna()
301 .reset_index()
302 )
303 shape = gpd.GeoDataFrame(
304 geometry=gpd.GeoSeries(gpd.points_from_xy(shape.x, shape.y))
305 .buffer(100)
306 .envelope,
307 crs=CRS,
308 ).dissolve()
309 return shape