import shutil
from hashlib import md5
from os import environ, makedirs
from pathlib import Path
import requests
from tqdm import tqdm
[docs]
def get_data_home_folder(data_home_folder: str | Path = None) -> Path:
"""
Returns the absolute path to the data home folder. If no path is provided, the function
uses the `WEID_DATA_PATH` environment variable. If that is not set, it defaults to
`~/.weid_data`. The directory is created if it does not exist.
:param data_home_folder: Optional custom path to the data folder.
:type data_home_folder: str or Path, optional
:return: Absolute path to the data home folder.
:rtype: Path
.. rubric:: Example
.. code-block:: python
>>> from farmnet.data.download import get_data_home_folder
>>> from pathlib import Path
>>> import pathlib
>>> import os
.. code-block:: python
>>> from pathlib import Path
>>> path = get_data_home_folder()
>>> isinstance(path, Path)
True
>>> path.exists()
True
.. code-block:: python
>>> test_path = pathlib.Path().resolve() / Path("test_folder")
>>> path = get_data_home_folder(test_path)
>>> path == test_path
True
>>> path.exists()
True
>>> os.rmdir(test_path)
"""
if data_home_folder is None:
data_home_folder = environ.get(
"WEID_DATA_PATH", (Path.home() / ".weid_data")
)
data_home_folder = Path(data_home_folder).absolute()
makedirs(data_home_folder, exist_ok=True)
return data_home_folder
[docs]
def download_from_url(
    url: str,
    download_path: str | Path,
    *,
    timeout: float | tuple[float, float] | None = 60.0,
) -> None:
    """
    Download a file from a URL to a local path, showing a progress bar.

    The function performs a streaming HTTP GET request and copies the raw
    response body to disk while `tqdm` reports progress. Data is first written
    to a sibling ``<name>.part`` file and only moved to ``download_path`` once
    the transfer has finished, so an interrupted download never leaves a
    truncated file at the final location.

    :param url: The URL of the file to download.
    :type url: str
    :param download_path: The local path where the downloaded file will be saved.
        Can be a string or a pathlib.Path.
    :type download_path: str | pathlib.Path
    :param timeout: Timeout in seconds handed to ``requests.get`` (a single
        value or a ``(connect, read)`` tuple). ``None`` waits indefinitely.
    :type timeout: float | tuple[float, float] | None, optional
    :return: None
    :rtype: None
    :raises HTTPError: If the HTTP request returned an unsuccessful status code.
    :raises OSError: If the file cannot be written to the specified path.

    .. rubric:: Example

    .. code-block:: python

        >>> from pathlib import Path
        >>> from farmnet.data.download import download_from_url
        >>> import os
        >>> import requests

    .. code-block:: python

        >>> path = Path("Kelmarsh_WT_static.csv")
        >>> download_from_url(
        ...     "https://zenodo.org/api/records/8252025/files/Kelmarsh_WT_static.csv/content",
        ...     path
        ... )
        >>> path.exists()
        True
        >>> os.remove(path)

    .. code-block:: python

        >>> path = Path("Kelmarsh_WT_static.csv")
        >>> try:
        ...     download_from_url(
        ...         "https://zenodo.ch/api/records/8252025/files/Kelmarsh_WT_static.csv/content",
        ...         path
        ...     )
        ... except requests.exceptions.ConnectionError:
        ...     pass
        >>> path.exists()
        False
    """
    download_path = Path(download_path).absolute()
    partial_path = download_path.with_name(download_path.name + ".part")
    desc = f"Downloading {url}"

    # "Accept-Encoding: None" removes the header so the raw stream is the
    # file's own bytes rather than a compressed transfer encoding.
    # The `with` block releases the streamed connection even when
    # raise_for_status() or the copy fails.
    with requests.get(
        url,
        stream=True,
        headers={"Accept-Encoding": None},
        timeout=timeout,
    ) as r:
        r.raise_for_status()
        file_size = int(r.headers.get("content-length", 0))
        try:
            with tqdm.wrapattr(
                r.raw, "read", total=file_size, desc=desc
            ) as r_raw:
                with partial_path.open("wb") as f:
                    shutil.copyfileobj(r_raw, f)
            # Publish the file only after the whole body has been written.
            partial_path.replace(download_path)
        except BaseException:
            # Also covers KeyboardInterrupt: drop the incomplete file, then
            # let the original exception propagate unchanged.
            partial_path.unlink(missing_ok=True)
            raise
[docs]
def checksum_ok(filename: str | Path, remote_checksum: str):
"""
Verifies whether a local file's MD5 checksum matches a given remote checksum.
This function reads the contents of a file, computes its MD5 checksum, and compares
it to the provided checksum string. If the checksums do not match, a warning message
is printed to the console.
:param filename: Path to the file to verify. Can be a string or a Path object.
:type filename: str | pathlib.Path
:param remote_checksum: The expected checksum string to compare against.
:type remote_checksum: str
:return: None
:rtype: None
.. rubric:: Example
.. code-block:: python
>>> from pathlib import Path
>>> from farmnet.data.download import checksum_ok
>>> from farmnet.utils import getenv
Correct checksum:
.. code-block:: python
>>> file_path = Path(getenv("DOWNLOAD_PATH", "./data")) / 'Kelmarsh_WT_static.csv'
>>> remote_checksum = 'af3a038f0f7fddfc1608ad0bfc8cf5ba'
>>> checksum_ok(file_path, remote_checksum)
False checksum:
.. code-block:: python
>>> file_path = Path(getenv("DOWNLOAD_PATH", "./data")) / 'Kelmarsh_WT_static.csv'
>>> remote_checksum = 'bf3a038f0f7fddfc1608ad0bfc8cf5ba'
>>> checksum_ok(file_path, remote_checksum)
=================================================
Checksum wrong for Kelmarsh_WT_static.csv
af3a038f0f7fddfc1608ad0bfc8cf5ba != bf3a038f0f7fddfc1608ad0bfc8cf5ba
=================================================
"""
filename = Path(filename)
with open(filename, "rb") as myzip:
checksum = md5(myzip.read()).hexdigest()
if not checksum == remote_checksum:
print("=================================================")
print(f"Checksum wrong for {filename.name}")
print(f"{checksum} != {remote_checksum}")
print("=================================================")
[docs]
def substring_match(include: list[str], string: str) -> str | None:
"""
Checks whether any of the substrings in a list are present in a given string.
Iterates over the list of substrings and returns the input string if any substring
is found within it. Returns `None` if no matches are found.
:param include: List of substrings to check for.
:type include: list[str]
:param string: The target string to search within.
:type string: str
:return: The original string if a match is found; otherwise, None.
:rtype: str | None
.. rubric:: Example
.. code-block:: python
>>> from farmnet.data.download import substring_match
.. code-block:: python
>>> include = ['Kelmarsh_WT_static.csv', 'Kelmarsh_SCADA_2022_4457.zip']
>>> substring_match(include, 'Kelmarsh_SCADA_2022_4457.zip')
'Kelmarsh_SCADA_2022_4457.zip'
>>> substring_match(include, 'Test.zip')
"""
for inc in include:
if inc in string:
return string
return None
[docs]
def zenodo_download(
    id: str | int,
    download_folder: str | Path,
    force_download: bool = False,
    include: list[str] | None = None,
    exclude: list[str] | None = None,
) -> None:
    """
    Download files from a Zenodo record.

    Retrieves the file list of a Zenodo record by ID and downloads the files
    to the specified folder, which is created if it does not exist yet. Files
    can be filtered using inclusion or exclusion patterns. A file is only
    downloaded if it does not already exist, unless `force_download` is True.
    Each freshly downloaded file is compared against the MD5 checksum
    published in the record metadata via `checksum_ok`; a mismatch is reported
    by a printed warning and does not raise.

    :param id: The Zenodo record ID.
    :type id: str or int
    :param download_folder: Path to the folder where files will be downloaded.
    :type download_folder: str or Path
    :param force_download: If True, re-download files even if they already exist.
    :type force_download: bool, optional
    :param include: List of substrings; only files that match one of them will be downloaded.
    :type include: list[str], optional
    :param exclude: List of substrings; files that match one of them will be skipped.
    :type exclude: list[str], optional
    :raises requests.HTTPError: If the Zenodo API request fails.
    :raises OSError: If the download folder or a file cannot be written.

    .. rubric:: Example

    .. code-block:: python

        >>> import os
        >>> from pathlib import Path
        >>> from farmnet.utils import getenv
        >>> from farmnet.data.download import zenodo_download

    .. code-block:: python

        >>> download_path = Path(getenv("DOWNLOAD_PATH", "./data"))
        >>> file_name = Path("Kelmarsh_WT_static.csv")
        >>> zenodo_download(
        ...     8252025,
        ...     download_path,
        ...     force_download=False,
        ...     include=["Kelmarsh_WT_static.csv"],
        ... )
        >>> file_path = download_path / file_name
        >>> file_path.exists()
        True
        >>> os.remove(file_path)
    """
    url_zenodo = r"https://zenodo.org/api/records/"
    # A timeout keeps a stalled API call from blocking the caller forever.
    r = requests.get(f"{url_zenodo}{id}", timeout=60)
    r.raise_for_status()
    metadata = r.json()

    # The destination folder is the same for every file: resolve it once.
    target_folder = Path(download_folder).expanduser().absolute()

    for file in metadata["files"]:
        file_key = file["key"]
        if include and substring_match(include, file_key) is None:
            continue
        if exclude and substring_match(exclude, file_key) is not None:
            continue

        filename = target_folder / file_key
        if not filename.exists() or force_download:
            # Make sure the destination exists; opening the output file would
            # otherwise fail on a fresh download folder.
            filename.parent.mkdir(parents=True, exist_ok=True)
            download_from_url(file["links"]["self"], filename)
            # Metadata checksums look like "md5:<hex>"; keep the hex part.
            remote_checksum = file["checksum"].split(":")[-1]
            checksum_ok(filename, remote_checksum)