"""
Wranglers for transforming various data sources to the farmnet data format.
Configuration file
--------------------------
Configuration file to map data source files to farmnet data format.
"""
# Copyright (C) 2023 OST Ostschweizer Fachhochschule
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Author: Florian Hammer <florian.hammer@ost.ch>
from pathlib import Path
import pandas as pd
from farmnet.utils import read_config
from farmnet.utils import getenv
from pathlib import Path
import os
# Module-level cache of the default configuration; it starts empty and is
# replaced by set_default_cfg().
rcConfig = {}
def _test_global_cfg(func, *args, **kwargs):
    """
    Call ``func`` and return the module-level configuration afterwards.

    Helper used by the doctests to inspect ``rcConfig`` after a function that
    modifies it has run. The return value of ``func`` itself is discarded.

    :param func: Callable to invoke.
    :param args: Positional arguments forwarded to ``func``.
    :param kwargs: Keyword arguments forwarded to ``func``.
    :return: The current value of the module-level ``rcConfig``.
    """
    func(*args, **kwargs)
    # Only reading the module global, so no ``global`` declaration is needed.
    return rcConfig
[docs]
def set_default_cfg(config_path: str | Path):
    """
    Load a configuration file and make it the module-wide default.

    The parsed configuration is stored in the module-level ``rcConfig``. The
    ``get_*`` functions of this module fall back to it when they are called
    without a ``config_path``.

    :param config_path:
        Path to the configuration file; it is handed to ``read_config``.
    :type config_path: str | Path

    **Examples:**

    After the call, the module-level configuration holds the content of the file:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> rcConfig_test = _test_global_cfg(set_default_cfg,default_cfg_path)
    >>> print(json.dumps(rcConfig_test, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "columns": [
            {
                "name": "wind_direction",
                "name-from-source": "Wind direction (°)"
            },
            {
                "name": "nacelle_direction",
                "name-from-source": "Nacelle position (°)"
            },
            {
                "name": "wind_speed",
                "name-from-source": "Wind speed (m/s)"
            },
            {
                "name": "power",
                "name-from-source": "Power (kW)"
            },
            {
                "name": "wt_id",
                "name-from-source": "Wind turbine ID"
            }
        ],
        "csv": {
            "encoding": "utf8",
            "header": 0,
            "sep": ","
        },
        "dataset": {
            "data": "featured_windeurope_data.parquet",
            "root_dir": "kelmarsh_data_imputation",
            "static": "Kelmarsh_WT_static.csv"
        },
        "index": {
            "name": "datetime",
            "name-from-source": "# Date and time",
            "time-zone": "UTC",
            "time-zone-from-source": "UTC",
            "unit": "ns"
        }
    }
    """
    global rcConfig
    loaded_cfg = read_config(config_path)
    # Rebind (rather than update) so the previous default is dropped entirely.
    rcConfig = loaded_cfg
def _get_cfg_field(
    field: str, config_path: str | Path | None = None
) -> dict | list:
    """
    Return a shallow copy of one top-level section of a configuration.

    :param field: Name of the top-level section (e.g. ``"dataset"``,
        ``"columns"``, ``"csv"`` or ``"index"``).
    :type field: str
    :param config_path:
        Path to the configuration file. If ``None``, the module-level default
        configuration set via :func:`~.set_default_cfg` is used.
    :type config_path: str | Path | None
    :return: A shallow copy of the requested section. The ``"columns"``
        section is a list of dictionaries, the other sections used in this
        module are dictionaries.
    :rtype: dict | list
    :raises KeyError: If the section is missing, e.g. because no default
        configuration has been set yet.
    """
    # An explicit path always wins; otherwise read from the default (cached)
    # configuration. Reading the module global needs no ``global`` statement.
    cfg = rcConfig if config_path is None else read_config(config_path)
    try:
        section = cfg[field]
    except KeyError:
        if config_path is None:
            source = "the default configuration (was set_default_cfg() called?)"
        else:
            source = f"'{config_path}'"
        # Same exception type as before, but with a message that tells the
        # caller which section is missing and where it was looked up.
        raise KeyError(
            f"configuration section '{field}' not found in {source}"
        ) from None
    # Shallow copy: callers may add/remove top-level entries without touching
    # the cached configuration; nested objects are still shared.
    return section.copy()
[docs]
def get_dataset(config_path: str | Path | None = None) -> dict:
    """
    Return the ``dataset`` section of a configuration.

    :param config_path:
        Path to the configuration file. If None, the default configuration
        set via :func:`~.set_default_cfg` is used.
    :type config_path: str | Path | None
    :return: A dictionary describing where the dataset is stored.
    :rtype: dict

    The returned dictionary contains:

    - **root_dir** (`str`): Root directory of the database.
    - **data** (`str`): Name of the data file.
    - **static** (`str`): Name of the static file.

    **Examples:**

    With an explicit configuration file, its dataset directory, data file and
    static file are returned:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> ds = get_dataset(default_cfg_path)
    >>> print(json.dumps(ds, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "data": "featured_windeurope_data.parquet",
        "root_dir": "kelmarsh_data_imputation",
        "static": "Kelmarsh_WT_static.csv"
    }

    If `config_path` is `None`, `get_dataset()` returns the dataset
    information of the default configuration set with
    :func:`~.set_default_cfg()`:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> set_default_cfg(default_cfg_path)
    >>> ds = get_dataset()
    >>> print(json.dumps(ds, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "data": "featured_windeurope_data.parquet",
        "root_dir": "kelmarsh_data_imputation",
        "static": "Kelmarsh_WT_static.csv"
    }
    """
    dataset_cfg = _get_cfg_field("dataset", config_path)
    return dataset_cfg
[docs]
def get_column_mapping(config_path: str | Path | None = None) -> dict:
    """
    Build the source-to-standard column name mapping.

    Every entry of the configuration's ``columns`` section contributes one
    item: its ``name-from-source`` becomes the key and its ``name`` the value.

    :param config_path:
        Path to the configuration file. If None, the default configuration
        set via :func:`~.set_default_cfg` is used.
    :type config_path: str | Path | None
    :return:
        A dictionary mapping source column names to standardized column names.
    :rtype: dict

    **Examples:**

    With an explicit configuration file, the column mapping of that file is
    returned:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> ds = get_column_mapping(default_cfg_path)
    >>> print(json.dumps(ds, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "Nacelle position (°)": "nacelle_direction",
        "Power (kW)": "power",
        "Wind direction (°)": "wind_direction",
        "Wind speed (m/s)": "wind_speed",
        "Wind turbine ID": "wt_id"
    }

    If `config_path` is `None`, `get_column_mapping()` returns the column
    mapping of the default configuration set with :func:`~.set_default_cfg()`:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> set_default_cfg(default_cfg_path)
    >>> ds = get_column_mapping()
    >>> print(json.dumps(ds, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "Nacelle position (°)": "nacelle_direction",
        "Power (kW)": "power",
        "Wind direction (°)": "wind_direction",
        "Wind speed (m/s)": "wind_speed",
        "Wind turbine ID": "wt_id"
    }
    """
    mapping = {}
    for entry in _get_cfg_field("columns", config_path):
        # Later entries overwrite earlier ones that share a source name.
        mapping[entry["name-from-source"]] = entry["name"]
    return mapping
[docs]
def get_csv_fmt(config_path: str | Path | None = None) -> dict:
    """
    Return the ``csv`` section of a configuration.

    :param config_path: Path to the configuration file. If `None`, the default
        configuration set via :func:`~.set_default_cfg` is used.
    :type config_path: str | Path | None
    :return: A dictionary containing csv formatting details.
    :rtype: dict

    The returned dictionary contains:

    - **encoding** (`str`): Encoding to use when reading (e.g. 'utf-8').
    - **sep** (`str`): Character or regex pattern to treat as the delimiter.
    - **header** (`int`): Row number(s) containing column labels and marking
      the start of the data (zero-indexed).

    **Examples:**

    With an explicit configuration file, the csv configuration of that file is
    returned:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> ds = get_csv_fmt(default_cfg_path)
    >>> print(json.dumps(ds, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "encoding": "utf8",
        "header": 0,
        "sep": ","
    }

    If `config_path` is `None`, `get_csv_fmt()` returns the csv configuration
    of the default configuration set with :func:`~.set_default_cfg()`:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> set_default_cfg(default_cfg_path)
    >>> ds = get_csv_fmt()
    >>> print(json.dumps(ds, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "encoding": "utf8",
        "header": 0,
        "sep": ","
    }
    """
    csv_cfg = _get_cfg_field("csv", config_path)
    return csv_cfg
[docs]
def get_index_fmt(config_path: str | Path | None = None) -> dict:
    """
    Return the index format derived from the ``index`` configuration section.

    The raw section keys (``name``, ``name-from-source``, ``unit``,
    ``time-zone``, ``time-zone-from-source``) are regrouped into
    ``(source, target)`` pairs.

    :param config_path: Path to the configuration file. If `None`, the default
        configuration set via :func:`~.set_default_cfg` is used.
    :type config_path: str | Path | None
    :return: A dictionary containing index formatting details.
    :rtype: dict

    The returned dictionary contains:

    - **name_mapping** (`tuple[str, str]`): Mapping of source index name to target index name.
    - **dt_format** (`str`): Date/time format unit.
    - **tz_mapping** (`tuple[str, str]`): Mapping of source time zone to target time zone.

    **Examples:**

    With an explicit configuration file, the index-related details of that
    file are returned:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> ds = get_index_fmt(default_cfg_path)
    >>> print(json.dumps(ds, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "dt_format": "ns",
        "name_mapping": [
            "# Date and time",
            "datetime"
        ],
        "tz_mapping": [
            "UTC",
            "UTC"
        ]
    }

    If `config_path` is `None`, `get_index_fmt()` returns the index-related
    details of the default configuration set with :func:`~.set_default_cfg()`:

    >>> import json
    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> set_default_cfg(default_cfg_path)
    >>> ds = get_index_fmt()
    >>> print(json.dumps(ds, indent=4, sort_keys=True, ensure_ascii=False))
    {
        "dt_format": "ns",
        "name_mapping": [
            "# Date and time",
            "datetime"
        ],
        "tz_mapping": [
            "UTC",
            "UTC"
        ]
    }
    """
    index_cfg = _get_cfg_field("index", config_path)
    # Pairs are ordered (source, target).
    name_mapping = (index_cfg["name-from-source"], index_cfg["name"])
    dt_format = index_cfg["unit"]
    tz_mapping = (index_cfg["time-zone-from-source"], index_cfg["time-zone"])
    return dict(
        name_mapping=name_mapping,
        dt_format=dt_format,
        tz_mapping=tz_mapping,
    )
[docs]
def read_raw(
fpath: Path | str,
*,
csv_fmt: dict | None = None,
index_fmt: dict | None = None,
**kwargs,
) -> pd.DataFrame:
"""
Read a raw data file.
:param fpath: Path to the data file.
:type fpath: Path or str
:param csv_fmt: Dictionary of CSV format options to be passed to :func:`pandas.read_csv`.
If not provided, the output of :func:`~.get_csv_fmt` is used.
:type csv_fmt: dict or None
:param index_fmt: Information about the index of the returned data.
If not provided, the output of :func:`~.get_index_fmt` is used.
:type index_fmt: dict or None
:param kwargs: Additional keyword arguments passed to :func:`pandas.read_csv`.
:return: Formatted raw data.
:rtype: :class:`pandas.DataFrame`
**Examples:**
>>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
>>> set_default_cfg(default_cfg_path)
>>> download_path = Path(getenv("DOWNLOAD_PATH", "./data"))
>>> dataset = get_dataset()
>>> data_path = download_path / dataset["data"]
>>> raw_data_path = download_path / "kelmarsh_raw.csv"
>>> df_raw = read_raw(raw_data_path)
>>> print(df_raw.to_string(max_cols=5, max_rows=10))
Wind speed (m/s) Wind speed, Standard deviation (m/s) ... Tower Acceleration Y, StdDev (mm/ss) Wind turbine ID
datetime ...
2022-01-01 00:00:00+00:00 6.781222 1.182439 ... 11.422541 228
2022-01-01 00:10:00+00:00 6.936052 1.287222 ... 16.457248 228
2022-01-01 00:20:00+00:00 7.294642 1.430000 ... 16.063823 228
2022-01-01 00:30:00+00:00 8.080467 1.023509 ... 18.288907 228
2022-01-01 00:40:00+00:00 7.021328 1.066915 ... 22.059917 228
... ... ... ... ... ...
2022-12-31 23:10:00+00:00 8.712688 1.216442 ... 28.254154 233
2022-12-31 23:20:00+00:00 9.149686 1.182500 ... 15.370069 233
2022-12-31 23:30:00+00:00 9.571797 1.619526 ... 13.412479 233
2022-12-31 23:40:00+00:00 9.549912 1.504496 ... 18.748812 233
2022-12-31 23:50:00+00:00 9.215081 1.208763 ... 19.858008 233
>>> df_raw.index.name
'datetime'
>>> len(df_raw)
315360
"""
if csv_fmt is None:
csv_fmt = get_csv_fmt()
df = pd.read_csv(fpath, **csv_fmt, **kwargs)
# Set index
if index_fmt is None:
index_fmt = get_index_fmt()
df.rename(columns=dict([index_fmt["name_mapping"]]), inplace=True)
df.set_index(index_fmt["name_mapping"][1], inplace=True)
# parse dates
df.index = pd.to_datetime(df.index, unit=index_fmt["dt_format"])
# change time zone to UTC
from_, to = index_fmt["tz_mapping"]
df.index = df.index.tz_localize(from_).tz_convert(to)
return df
[docs]
def to_farmnet(df: pd.DataFrame, *, column_mapping: dict) -> pd.DataFrame:
    """
    Convert a raw-data dataframe into a FarmNet dataframe.

    The columns of ``df`` are renamed according to ``column_mapping`` and only
    the mapped (target) columns are kept, in the order of the mapping. The
    input dataframe is left untouched; a new dataframe is returned.

    The FarmNet dataframe is defined in the FarmNet data manifest and
    is used as a data interface for the FarmNet data pipeline.

    :param df: DataFrame containing raw data.
    :type df: pd.DataFrame
    :param column_mapping: Dictionary mapping raw column names to FarmNet column names.
    :type column_mapping: dict
    :return: A transformed FarmNet-compatible DataFrame.
    :rtype: :class:`pandas.DataFrame`

    **Examples:**

    >>> default_cfg_path = Path(getenv("CONFIG_PATH", "examples/kelmarsh.toml"))
    >>> set_default_cfg(default_cfg_path)
    >>> dataset = get_dataset()
    >>> download_path = Path(getenv("DOWNLOAD_PATH", "./data"))
    >>> raw_data_path = download_path / "kelmarsh_raw.csv"
    >>> df_raw = read_raw(raw_data_path)
    >>> column_mapping = get_column_mapping()
    >>> df_farmnet = to_farmnet(df_raw, column_mapping=column_mapping)
    >>> print(df_farmnet.to_string(max_cols=6, max_rows=10))
    wind_direction nacelle_direction wind_speed power wt_id
    datetime
    2022-01-01 00:00:00+00:00 185.795348 193.731354 6.781222 630.889598 228
    2022-01-01 00:10:00+00:00 189.458687 193.731354 6.936052 809.339449 228
    2022-01-01 00:20:00+00:00 188.648729 193.731354 7.294642 893.607333 228
    2022-01-01 00:30:00+00:00 188.826550 193.731354 8.080467 995.583734 228
    2022-01-01 00:40:00+00:00 191.252213 193.731354 7.021328 926.519441 228
    ... ... ... ... ... ...
    2022-12-31 23:10:00+00:00 210.193670 205.457916 8.712688 1447.101428 233
    2022-12-31 23:20:00+00:00 208.465164 205.457916 9.149686 1572.766687 233
    2022-12-31 23:30:00+00:00 213.539677 205.457916 9.571797 1653.457245 233
    2022-12-31 23:40:00+00:00 213.684894 205.457916 9.549912 1670.531378 233
    2022-12-31 23:50:00+00:00 209.309463 205.457916 9.215081 1563.665674 233
    >>> id(df_farmnet) == id(df_raw)
    False
    >>> list(df_farmnet.columns) == list(column_mapping.values())
    True
    """
    renamed = df.rename(columns=column_mapping)
    target_columns = list(column_mapping.values())
    # Select only the mapped columns; copy so the result is independent of
    # the intermediate frame.
    return renamed[target_columns].copy()
if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings when the file is
    # executed as a script.
    import doctest

    # NORMALIZE_WHITESPACE: expected output is compared modulo runs of whitespace.
    doctest_flags = doctest.NORMALIZE_WHITESPACE
    doctest.testmod(name="read_raw", optionflags=doctest_flags)