1"""Functions to download and read data."""
2
3import glob
4import os
5from datetime import datetime, timezone
6from zipfile import ZipFile
7
8import geopandas as gpd
9import pandas as pd
10import pooch
11import rasterio as rio
12import shapely
13import xarray as xr
14from geocube.api.core import make_geocube
15
# Coordinate reference system (EPSG code) of the Kish Basin DAT (XYZ) files
17CRS = 23029
18
19
def download_data(url, data_dir, file_name, known_hash=None):
    """Download data and store it in the specified directory using Pooch.

    Parameters
    ----------
    url : str
        URL from which the data will be downloaded
    data_dir : str
        Directory to store the downloaded data
    file_name : str
        Name of the downloaded data file with its extension (not full path)
    known_hash : str, optional
        SHA256 hash of downloaded file; if None, no hash verification is
        performed by Pooch

    Notes
    -----
    This only downloads data if necessary, i.e. if the data file does not
    already exist in the directory. A sidecar ``<file_name>.txt`` recording
    the download timestamp, URL, and SHA256 hash is written next to the
    data file.
    """
    data_file = os.path.join(data_dir, file_name)
    if not os.path.isfile(data_file):
        os.makedirs(data_dir, exist_ok=True)
        pooch.retrieve(
            url=url, known_hash=known_hash, fname=file_name, path=data_dir
        )
        print(f"Data downloaded on: {datetime.now(tz=timezone.utc)}")
        with open(f"{data_file}.txt", "w", encoding="utf-8") as outfile:
            outfile.write(
                f"Data downloaded on: {datetime.now(tz=timezone.utc)}\n"
                f"Download URL: {url}\n"
                f"SHA256 hash: {pooch.file_hash(data_file)}\n"
            )
    else:
        print(f"Data '{file_name}' already exists in '{data_dir}'.")
        # the sidecar metadata file is missing when the data file was
        # placed manually (i.e. not via this function); avoid raising
        # FileNotFoundError in that case
        if os.path.isfile(f"{data_file}.txt"):
            with open(f"{data_file}.txt", encoding="utf-8") as f:
                print(f.read())
56
57
def kish_basin_extent(dat_path):
    """Read the extent of the Kish Basin data.

    Parameters
    ----------
    dat_path : str
        Path to the Kish Basin data files

    Returns
    -------
    geopandas.GeoSeries
        GeoPandas geoseries of the extent polygon
    """
    extent_csv = pd.read_csv(
        os.path.join(dat_path, "Kish GIS Map Extent - Square.csv"), skiprows=2
    )
    # NOTE: the CSV's column headers carry a leading space (" X", " Y");
    # the four rows are the corner coordinates of the extent square
    corners = [
        (extent_csv[" X"][i], extent_csv[" Y"][i]) for i in range(4)
    ]
    return gpd.GeoSeries(shapely.geometry.Polygon(corners), crs=CRS)
86
87
def read_dat_file(dat_path):
    """Read XYZ halite data layers into an Xarray dataset.

    Parameters
    ----------
    dat_path : str
        Path to the DAT (ASCII XYZ) files

    Returns
    -------
    tuple[xarray.Dataset, geopandas.GeoSeries]
        Xarray dataset of the XYZ data and GeoPandas geoseries of the extent

    Notes
    -----
    Each ``*.dat`` file is one layer; files with "Zone" in their name are
    skipped. Layers are rasterised with geocube onto a common grid and then
    reorganised so that the halite member becomes a dataset dimension and
    the measured quantity (e.g. thickness, depth) becomes a data variable.
    """
    gdf = {}
    # for dat_file in glob.glob(os.path.join(dat_path, "*.dat")):
    for dat_file in [
        x
        for x in glob.glob(os.path.join(dat_path, "*.dat"))
        if "Zone" not in x
    ]:
        # read each layer as individual dataframes into a dictionary;
        # the dict key is the file name without the ".dat" extension
        gdf[os.path.split(dat_file)[-1][:-4]] = pd.read_fwf(
            dat_file, header=None, names=["X", "Y", "Z"]
        )

        # assign layer name to a column
        gdf[os.path.split(dat_file)[-1][:-4]]["data"] = os.path.split(
            dat_file
        )[-1][:-4]

    # # find data resolution
    # gdf_xr = gdf[list(gdf.keys())[0]].set_index(["X", "Y"]).to_xarray()
    # resx = gdf_xr["X"][1] - gdf_xr["X"][0]
    # resy = gdf_xr["Y"][1] - gdf_xr["Y"][0]

    # combine dataframes
    gdf = pd.concat(gdf.values())

    # convert dataframe to geodataframe
    gdf["geometry"] = gpd.points_from_xy(gdf.X, gdf.Y, crs=CRS)
    gdf = gpd.GeoDataFrame(gdf)
    gdf.drop(columns=["X", "Y"], inplace=True)

    # convert to Xarray dataset; one raster band per "data" layer
    # NOTE(review): resolution/align are hard-coded, presumably to the
    # data's native 200 m grid (see the commented-out resolution detection
    # above) — confirm against the source data
    xds = make_geocube(
        vector_data=gdf,
        # resolution=(-abs(resy), abs(resx)),
        # align=(abs(resy / 2), abs(resx / 2)),
        resolution=(-200, 200),
        align=(100, 100),
        group_by="data",
    )

    # split variables and halite members; layer names look like
    # "<Member> Halite <quantity> XYZ ..." based on the parsing below
    xds_ = {}
    for d in xds["data"].values:
        halite_member = d.split(" ")[0]
        # fix halite names (misspelled in the source file names)
        if halite_member == "Presall":
            halite_member = "Preesall"
        elif halite_member == "Flyde":
            halite_member = "Fylde"
        # unit = d.split(" ")[-1]
        # the quantity name between "Halite " and " XYZ"
        zvar = d.split("Halite ")[-1].split(" XYZ")[0]
        # attach the halite member as a new dimension and drop the raw
        # layer label
        xds_[d] = (
            xds.sel(data=d)
            .assign_coords(halite=halite_member)
            .expand_dims(dim="halite")
            .drop_vars("data")
        )
        # rename the generic "Z" variable to the quantity, e.g. "TopDepth"
        xds_[d] = xds_[d].rename({"Z": zvar.replace(" ", "")})
        # xds_[d][zvar.replace(" ", "")] = xds_[d][
        #     zvar.replace(" ", "")
        # ].assign_attrs(units=unit, long_name=zvar)

    # merge the per-layer datasets along the halite dimension
    xds = xr.combine_by_coords(xds_.values(), combine_attrs="override")

    # # keep only points corresponding to zones of interest in the dataframe
    # zones = gdf.loc[gdf["data"].str.contains("Zone")]

    # # create zones of interest polygon
    # zones = gpd.GeoDataFrame(geometry=zones.buffer(100).envelope).dissolve()

    # create extent polygon
    dat_extent = kish_basin_extent(dat_path=dat_path)

    return xds, dat_extent  # zones
175
176
def kish_basin_data_depth_adjusted(dat_path, bathymetry_path):
    """Read halite data with adjusted depths and its extent.

    Parameters
    ----------
    dat_path : str
        Path to the DAT (ASCII XYZ) files
    bathymetry_path : str
        Path to the bathymetry netCDF file

    Returns
    -------
    tuple[xarray.Dataset, geopandas.GeoSeries]
        Xarray dataset of the halite data and GeoPandas geoseries of the extent

    Notes
    -----
    A hacky workaround was used to prevent multiple grid mappings in the
    resulting Xarray dataset by first assigning a variable of the desired
    mapping and then multiplying it by zero prior to involving variables which
    originally had a different mapping.
    """
    # open the bathymetry dataset with its CRS decoded
    bath = xr.open_dataset(
        os.path.join(bathymetry_path, "D4_2022.nc"), decode_coords="all"
    )
    bath = bath.chunk({"lat": 1000, "lon": 1000, "cdi_index_count": 1000})
    xds, dat_extent = read_dat_file(dat_path=dat_path)
    # resample the bathymetry onto the halite data's grid and CRS;
    # reproject_match requires the dimensions to be named x/y
    bath = bath.rename({"lon": "x", "lat": "y"}).rio.reproject_match(
        xds, resampling=rio.enums.Resampling.bilinear
    )
    # depths relative to the seabed; NOTE(review): this addition assumes
    # "elevation" is negative below sea level — confirm against the
    # bathymetry data's conventions
    xds = xds.assign(TopDepthSeabed=xds["TopDepth"] + bath["elevation"])
    xds = xds.assign(BaseDepthSeabed=xds["BaseDepth"] + bath["elevation"])
    # hacky way to prevent multiple grid mappings: TopDepth * 0 carries
    # the halite data's grid mapping into the new variable
    xds = xds.assign(Bathymetry=xds["TopDepth"] * 0 + bath["elevation"])
    # for v in ["TopDepth", "BaseDepth"]:
    #     xds[f"{v}Seabed"].attrs = xds[v].attrs
    #     xds[f"{v}Seabed"].attrs["long_name"] = (
    #         xds[f"{v}Seabed"].attrs["long_name"] + " relative to the "
    #         "sea-floor"
    #     )
    # xds["Bathymetry"].attrs = bath["elevation"].attrs
    return xds, dat_extent
219
220
def bathymetry_layer(dat_extent, bathymetry_path):
    """Bathymetry layer for plotting.

    Parameters
    ----------
    dat_extent : geopandas.GeoSeries
        Extent of the data
    bathymetry_path : str
        Path to the bathymetry netCDF file

    Returns
    -------
    xarray.Dataset
        Xarray dataset of the bathymetry data, reprojected to the Kish
        Basin CRS and clipped to the data extent
    """
    bathymetry = xr.open_dataset(
        os.path.join(bathymetry_path, "D4_2022.nc"), decode_coords="all"
    )
    bathymetry = bathymetry.chunk(
        {"lat": 1000, "lon": 1000, "cdi_index_count": 1000}
    )
    # reproject to the Kish Basin CRS, then clip to the extent polygon
    clipped = bathymetry.rio.reproject(CRS).rio.clip(dat_extent.to_crs(CRS))
    return clipped
242
243
def read_shapefile_from_zip(data_path, endswith=".shp"):
    """Read the Shapefile layer from a Zip file.

    Parameters
    ----------
    data_path : str
        Path to the exploration well Zip file
    endswith : str
        What the Shapefile's filename ends with

    Returns
    -------
    geopandas.GeoDataFrame
        Geodataframe of the Shapefile's data

    Raises
    ------
    IndexError
        If no member of the Zip archive ends with ``endswith``
    """
    # first matching member of the archive; raises IndexError if none match
    member = [
        x for x in ZipFile(data_path).namelist() if x.endswith(endswith)
    ][0]
    # GDAL's zip virtual path: "zip://<archive>!<member>"; the original
    # wrapped this in a one-argument os.path.join, which is a no-op
    return gpd.read_file(f"zip://{data_path}!{member}")
270
271
def halite_shape(dat_xr, halite=None):
    """Create a vector shape of the halite data.

    Parameters
    ----------
    dat_xr : xarray.Dataset
        Xarray dataset of the halite data
    halite : str
        Halite member; if not given, the maximum over all members is used

    Returns
    -------
    geopandas.GeoDataFrame
        A (multi)polygon geodataframe of the halite's shape
    """
    # pick one halite member, or collapse the halite dimension by maximum
    if halite:
        thickness = dat_xr.sel(halite=halite)["Thickness"]
    else:
        thickness = dat_xr.max(dim="halite")["Thickness"]
    # cells with data, as one row per (x, y) point
    cells = thickness.to_dataframe().dropna().reset_index()
    # turn each cell centre into a 200 m square (buffer(100).envelope)
    # and merge all squares into a single (multi)polygon
    squares = (
        gpd.GeoSeries(gpd.points_from_xy(cells.x, cells.y))
        .buffer(100)
        .envelope
    )
    return gpd.GeoDataFrame(geometry=squares, crs=CRS).dissolve()