# Source code for farmnet.data.datasets.utils

# -*- coding: utf-8 -*-
"""Functions for handling PyTorch Geometric Datasets"""

from pathlib import Path
from typing import Any
import numpy as np
from torch_geometric.data import InMemoryDataset  # type: ignore

from farmnet.data.datasets.kelmarsh import KelmarshDataset

# from farmgnn.configuration import settings



# [docs]
def dataset_sample(
    dataset: InMemoryDataset, sample_size: int
) -> InMemoryDataset:
    """
    Draws ``sample_size`` distinct elements from ``dataset`` at random.

    Positions are drawn without replacement from ``0 .. len(dataset) - 1``
    using NumPy's global random state (``np.random.choice``), so results are
    reproducible only if the caller seeds ``np.random`` beforehand. The
    selected positions are handed to ``dataset.copy`` and its result is
    returned unchanged.

    :param dataset: The dataset to draw from; it must support ``len()`` and ``copy(idx)``.
    :param sample_size: How many elements to draw.
    :return: Whatever ``dataset.copy`` returns for the drawn positions.

    :raises IndexError: If ``sample_size`` exceeds the length of ``dataset``.

    .. rubric:: Example

    .. code-block:: python

        >>> import numpy as np
        >>> from torch_geometric.data import InMemoryDataset
        >>> class DummyDataset(InMemoryDataset):
        ...     def __init__(self, length):
        ...         super().__init__()
        ...         self.data_list = [i for i in range(length)]
        ...     def __len__(self):
        ...         return len(self.data_list)
        ...     def copy(self, idx):
        ...         new_ds = DummyDataset(0)
        ...         new_ds.data_list = [self.data_list[i] for i in idx]
        ...         return new_ds
        >>> np.random.seed(42)
        >>> dataset = DummyDataset(10)
        >>> sampled_dataset = dataset_sample(dataset, 5)
        >>> len(sampled_dataset)
        5
        >>> all(item in dataset.data_list for item in sampled_dataset.data_list)
        True

    .. warning:: TODO replace DummyDataset with Kelmarsh
    """
    population = len(dataset)

    # Reject oversized requests up front so the caller gets a clear message
    # instead of NumPy's generic "larger sample than population" error.
    if sample_size > population:
        message = f"The sample size {sample_size} is greater than the dataset length {population}"
        raise IndexError(message)

    # replace=False guarantees that no position is picked twice.
    chosen = np.random.choice(
        np.arange(population), size=sample_size, replace=False
    )
    return dataset.copy(chosen)




# [docs]
def load_dataset(path: str | Path) -> InMemoryDataset:
    """
    Builds a :class:`KelmarshDataset` for ``path`` with a fixed configuration.

    The dataset is always created with the feature columns ``u_g``, ``v_g``
    and ``nacelle_direction``, the target ``wind_speed`` and the turbine
    identifier column ``wt_id``. ``data_path`` and ``windfarm_static_path``
    are passed as ``None``; how ``KelmarshDataset`` resolves its inputs in
    that case is defined by that class (not visible here -- TODO confirm).

    :param path: The path to the dataset directory as a string or Path object.
    :return: The constructed ``KelmarshDataset`` instance.

    .. rubric:: Example

    .. code-block:: python

        # >>> from farmnet.data.datasets.utils import load_dataset
        # >>> from pathlib import Path
        # >>> # Assuming KelmarshDataset is correctly defined and available
        # >>> dataset = load_dataset(Path("examples"))
        # >>> isinstance(dataset, InMemoryDataset)
        # True
    """
    # Input columns handed to the dataset as features.
    feature_columns = ["u_g", "v_g", "nacelle_direction"]

    return KelmarshDataset(
        path,
        data_path=None,
        windfarm_static_path=None,
        features=feature_columns,
        target="wind_speed",
        wt_col="wt_id",
    )



#     data_path = Path(settings.dataset.data_path).expanduser().absolute()
#     dataset_dir = Path(settings.dataset.root_dir).expanduser().absolute()
#
#     config = {"graph": settings.dataset.graph, "windfarm": settings.windfarm}
#     if settings.dataset.name == "WinJiDataset":
#         return WinJiDataset(dataset_dir, data_path, config=config)
#     elif settings.dataset.name == "PyWakeDataset":
#         return PyWakeDataset(dataset_dir, data_path, config=config)
#     else:
#         raise ValueError(f"Dataset {settings.dataset.name} does not exist!")



# [docs]
def train_test_split(
    dataset: InMemoryDataset, test_size: float = 0.2, seed: int = 0
) -> tuple[Any, Any]:
    """
    Splits a dataset into disjoint training and testing subsets.

    The training part holds ``round(len(dataset) * (1 - test_size))`` randomly
    chosen elements; every remaining element goes to the testing part, in
    ascending index order. The random draw uses a generator private to this
    call, so NumPy's global random state is left untouched. For a given
    ``seed`` the training indices are the same ones that
    ``np.random.seed(seed)`` followed by ``np.random.choice`` would produce.

    :param dataset: The input dataset to split; it must support ``len()`` and ``index_select(idx)``.
    :param test_size: The proportion of the dataset to include in the test split (default is 0.2).
    :param seed: Random seed for reproducibility (default is 0).
    :return: A tuple containing two datasets (train_dataset, test_dataset).

    :raises ValueError: If ``test_size`` is not within the closed interval ``[0, 1]``.

    .. rubric:: Example

    .. code-block:: python

        >>> import numpy as np
        >>> from torch_geometric.data import InMemoryDataset
        >>> class DummyDataset(InMemoryDataset):
        ...     def __init__(self, length):
        ...         super().__init__()
        ...         self.data_list = [i for i in range(length)]
        ...     def __len__(self):
        ...         return len(self.data_list)
        ...     def index_select(self, idx):
        ...         new_ds = DummyDataset(0)
        ...         new_ds.data_list = [self.data_list[i] for i in idx]
        ...         return new_ds
        >>> np.random.seed(0)
        >>> dataset = DummyDataset(10)
        >>> train_ds, test_ds = train_test_split(dataset, test_size=0.3, seed=42)
        >>> len(train_ds)
        7
        >>> len(test_ds)
        3

    .. warning:: TODO replace DummyDataset with Kelmarsh
    """
    # Fail early with a clear message; the chained comparison also rejects NaN.
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(
            f"test_size must be between 0 and 1 (inclusive), got {test_size}"
        )

    len_dataset = len(dataset)
    idx = np.arange(len_dataset)
    train_len = int(np.round(len_dataset * (1.0 - test_size)))

    # A private legacy generator yields the same stream as
    # ``np.random.seed(seed)`` would, without reseeding the process-wide RNG
    # that unrelated code may depend on.
    rng = np.random.RandomState(seed)
    train_idx = rng.choice(idx, size=train_len, replace=False)

    # Sorted complement as plain Python ints: deterministic order instead of
    # relying on set iteration order.
    test_idx = np.setdiff1d(idx, train_idx).tolist()

    return dataset.index_select(train_idx), dataset.index_select(test_idx)