Source code for muse.readers.csv

"""Ensemble of functions to read MUSE data.

In general, there are three functions per input file:

- ``read_x``: This is the overall function that is called to read the data. It takes a
    ``Path`` as input, and returns the relevant data structure (usually an xarray). The
    process is generally broken down into two functions that are called by ``read_x``:

- ``read_x_csv``: This takes a path to a csv file as input and returns a pandas
    DataFrame. There are some consistency checks, such as checking data types and
    columns. There is also some minor processing at this stage, such as standardising
    column names, but no structural changes to the data. The general rule is that
    anything returned by this function should still be valid as an input file if saved
    to CSV.

- ``process_x``: This is where more major processing and reformatting of the data is
    done. It takes the DataFrame from ``read_x_csv`` and returns the final data
    structure (usually an xarray). There are also some more checks (e.g. checking for
    NaN values).

Most of the processing is shared by a few helper functions:
- read_csv: reads a csv file and returns a dataframe
- standardize_dataframe: standardizes the dataframe to a common format
- create_multiindex: creates a multiindex from a dataframe
- create_xarray_dataset: creates an xarray dataset from a dataframe

A few other helpers perform common operations on xarrays:
- create_assets: creates assets from technologies
- check_commodities: checks commodities and fills missing values

"""

__all__ = [
    "read_agent_parameters",
    "read_attribute_table",
    "read_csv",
    "read_existing_trade",
    "read_global_commodities",
    "read_initial_capacity",
    "read_initial_market",
    "read_io_technodata",
    "read_macro_drivers",
    "read_presets",
    "read_regression_parameters",
    "read_technodata_timeslices",
    "read_technodictionary",
    "read_technologies",
    "read_timeslice_shares",
    "read_trade_technodata",
]

from logging import getLogger
from pathlib import Path

import pandas as pd
import xarray as xr

from muse.utilities import camel_to_snake

# Global mapping of column names to their standardized versions.
# This is for backwards compatibility with old file formats.
# The mapping is applied by ``standardize_columns`` *after* column names have been
# passed through ``camel_to_snake``, so keys are matched against converted names.
# NOTE(review): confirm that the mixed-case key "commodity_emission_factor_CO2" can
# still occur after that conversion.
COLUMN_RENAMES = {
    "process_name": "technology",
    "process": "technology",
    "sector_name": "sector",
    "region_name": "region",
    "time": "year",
    "commodity_name": "commodity",
    "comm_type": "commodity_type",
    "commodity_price": "prices",
    "units_commodity_price": "units_prices",
    "enduse": "end_use",
    "sn": "timeslice",
    "commodity_emission_factor_CO2": "emmission_factor",
    "utilisation_factor": "utilization_factor",
    "objsort": "obj_sort",
    "objsort1": "obj_sort1",
    "objsort2": "obj_sort2",
    "objsort3": "obj_sort3",
    "time_slice": "timeslice",
    "price": "prices",
}

# Columns whose *values* (not names) should be converted from camelCase to
# snake_case. ``standardize_dataframe`` applies ``camel_to_snake`` to every entry of
# these columns when they are present.
CAMEL_TO_SNAKE_COLUMNS = [
    "tech_type",
    "commodity",
    "commodity_type",
    "agent_share",
    "attribute",
    "sector",
    "region",
    "parameter",
]

# Global mapping of column names to their expected types.
# ``convert_column_types`` casts each of these columns (when present in a DataFrame)
# to the listed type; columns that are not listed are left untouched.
COLUMN_TYPES = {
    "year": int,
    "region": str,
    "technology": str,
    "commodity": str,
    "sector": str,
    "attribute": str,
    "variable": str,
    "timeslice": int,  # For tables that require int timeslice instead of month etc.
    "name": str,
    "commodity_type": str,
    "tech_type": str,
    "type": str,
    "function_type": str,
    "level": str,
    "search_rule": str,
    "decision_method": str,
    "quantity": float,
    "share": float,
    "coeff": str,
    "value": float,
    "utilization_factor": float,
    "minimum_service_factor": float,
    "maturity_threshold": float,
    "spend_limit": float,
    "prices": float,
    "emmission_factor": float,
}

# Default values per column, used by ``standardize_dataframe`` in two ways:
# - missing (NaN) entries in an existing column are filled with the default
# - a column that is required but absent from the file is created with the default
DEFAULTS = {
    "cap_par": 0,
    "cap_exp": 1,
    "fix_par": 0,
    "fix_exp": 1,
    "var_par": 0,
    "var_exp": 1,
    "interest_rate": 0,
    "utilization_factor": 1,
    "minimum_service_factor": 0,
    "search_rule": "all",
    "decision_method": "single",
    "growth_seed": 1.0,
}


def standardize_columns(data: pd.DataFrame) -> pd.DataFrame:
    """Standardizes column names in a DataFrame.

    This function:
    1. Drops a leading index column (empty or "Unnamed..." header), if present
    2. Converts column names to snake_case
    3. Applies the global COLUMN_RENAMES mapping
    4. Preserves any columns not in the mapping
    5. Checks that the resulting column names are unique

    Args:
        data: DataFrame to standardize

    Returns:
        DataFrame with standardized column names

    Raises:
        ValueError: If two columns end up with the same name after renaming
    """
    # Drop index column if present. The length check keeps a DataFrame without any
    # columns from failing with an IndexError on ``columns[0]``.
    if len(data.columns) > 0:
        first_column = data.columns[0]
        if first_column == "" or first_column.startswith("Unnamed"):
            data = data.iloc[:, 1:]

    # Convert columns to snake_case
    data = data.rename(columns=camel_to_snake)

    # Then apply global mapping
    data = data.rename(columns=COLUMN_RENAMES)

    # Make sure there are no duplicate columns (e.g. a legacy and a current name
    # that both map to the same standardized name)
    if len(data.columns) != len(set(data.columns)):
        raise ValueError(f"Duplicate columns in {data.columns}")

    return data


def create_multiindex(
    data: pd.DataFrame,
    index_columns: list[str],
    index_names: list[str],
    drop_columns: bool = True,
) -> pd.DataFrame:
    """Returns a copy of ``data`` indexed by a MultiIndex built from its columns.

    Args:
        data: DataFrame providing the index columns (left unmodified)
        index_columns: Names of the columns whose values form the index levels
        index_names: Names to give the index levels, in the same order
        drop_columns: If True, the index columns are removed from the result

    Returns:
        New DataFrame carrying the MultiIndex
    """
    level_values = [data[name] for name in index_columns]

    indexed = data.copy()
    indexed.index = pd.MultiIndex.from_arrays(level_values, names=index_names)

    return indexed.drop(columns=index_columns) if drop_columns else indexed


def create_xarray_dataset(
    data: pd.DataFrame,
    disallow_nan: bool = True,
) -> xr.Dataset:
    """Creates an xarray Dataset from a DataFrame with standardized options.

    If the result has a "year" coordinate, it is cast to int and the Dataset is
    sorted by year.

    Args:
        data: DataFrame to convert
        disallow_nan: Whether to raise an error if NaN values are found

    Returns:
        xarray Dataset

    Raises:
        ValueError: If ``disallow_nan`` is set and any data variable contains NaN, or
            if the "year" coordinate contains duplicates after conversion to int
    """
    result = xr.Dataset.from_dataframe(data)
    if disallow_nan:
        nan_coords = get_nan_coordinates(result)
        if nan_coords:
            raise ValueError(f"Missing data for coordinates: {nan_coords}")

    if "year" in result.coords:
        result = result.assign_coords(year=result.year.astype(int))
        result = result.sortby("year")

        # Distinct labels can collapse onto the same value when cast to int. This is
        # input validation, so raise explicitly rather than rely on ``assert``
        # (which is removed when Python runs with -O).
        if len(set(result.year.values)) != result.year.data.size:
            raise ValueError(
                "Duplicate years found after converting the 'year' coordinate to "
                f"int: {result.year.values.tolist()}"
            )

    return result


def get_nan_coordinates(dataset: xr.Dataset) -> list[tuple]:
    """Get coordinates of a Dataset where any data variable has NaN values.

    Args:
        dataset: Dataset to inspect

    Returns:
        Index entries (as produced by ``to_dataframe().index.to_list()``) of the
        positions where at least one data variable is null. Empty if there are no
        NaN values or the Dataset has no data variables.
    """
    null_masks = [var.isnull() for var in dataset.data_vars.values()]

    # ``sum`` over an empty list returns the plain int 0, which has no ``.any()``
    if not null_masks:
        return []

    # Per-position count of data variables that are null (non-zero means "has NaN")
    any_nan = sum(null_masks)
    if any_nan.any():
        return any_nan.where(any_nan, drop=True).to_dataframe(name="").index.to_list()
    return []


def convert_column_types(data: pd.DataFrame) -> pd.DataFrame:
    """Converts DataFrame columns to their expected types.

    Only columns listed in COLUMN_TYPES are converted; the input is not modified.

    Args:
        data: DataFrame to convert

    Returns:
        DataFrame with converted column types

    Raises:
        ValueError: If a column cannot be converted to its expected type. The
            underlying pandas error is attached as the cause.
    """
    result = data.copy()
    for column, expected_type in COLUMN_TYPES.items():
        if column not in result.columns:
            continue
        try:
            if expected_type is int:
                # NOTE(review): ``downcast`` only narrows the dtype where possible,
                # it does not reject non-integral values - confirm this is intended.
                result[column] = pd.to_numeric(result[column], downcast="integer")
            elif expected_type is float:
                result[column] = pd.to_numeric(result[column]).astype(float)
            elif expected_type is str:
                # NOTE(review): depending on the pandas version, missing values may
                # become the literal string "nan" here - verify downstream handling.
                result[column] = result[column].astype(str)
        except (ValueError, TypeError) as e:
            raise ValueError(
                f"Could not convert column '{column}' to "
                f"{expected_type.__name__}: {e}"
            ) from e
    return result


def standardize_dataframe(
    data: pd.DataFrame,
    required_columns: list[str] | set[str] | None = None,
    exclude_extra_columns: bool = False,
) -> pd.DataFrame:
    """Standardizes a DataFrame to a common format.

    Steps, in order: standardize column names, convert the values of
    CAMEL_TO_SNAKE_COLUMNS to snake_case, fill missing values from DEFAULTS (adding
    required columns that have a default), convert column types, then validate and
    optionally restrict the columns.

    Args:
        data: DataFrame to standardize
        required_columns: Column names that must be present (optional). May be a
            list or a set.
        exclude_extra_columns: If True, exclude any columns not in required_columns
            (optional). This can be important if extra columns can mess up the
            resulting xarray object. Only has an effect if required_columns is given.

    Returns:
        DataFrame containing the standardized data

    Raises:
        ValueError: If any required column is missing after standardization
    """
    if required_columns is None:
        required_columns = []

    # Standardize column names
    data = standardize_columns(data)

    # Convert specified column values from camelCase to snake_case
    for col in CAMEL_TO_SNAKE_COLUMNS:
        if col in data.columns:
            data[col] = data[col].apply(camel_to_snake)

    # Fill missing values with defaults
    data = data.fillna(DEFAULTS)
    for col, default in DEFAULTS.items():
        if col not in data.columns and col in required_columns:
            data[col] = default

    # Check/convert data types
    data = convert_column_types(data)

    # Validate required columns if provided
    if required_columns:
        missing_columns = [col for col in required_columns if col not in data.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Exclude extra columns if requested
        if exclude_extra_columns:
            if isinstance(required_columns, (list, tuple)):
                # An ordered collection defines the output column order
                selected = list(required_columns)
            else:
                # Sets have no stable order (string hashing is randomized per
                # process), so keep the order the columns have in the data to make
                # the result deterministic.
                selected = [col for col in data.columns if col in required_columns]
            data = data[selected]

    return data


def read_csv(
    path: Path,
    float_precision: str = "high",
    required_columns: list[str] | None = None,
    exclude_extra_columns: bool = False,
    msg: str | None = None,
) -> pd.DataFrame:
    """Reads and standardizes a CSV file into a DataFrame.

    Args:
        path: Path to the CSV file
        float_precision: Precision to use when reading floats
        required_columns: List of column names that must be present (optional)
        exclude_extra_columns: If True, exclude any columns not in required_columns
            list (optional). This can be important if extra columns can mess up the
            resulting xarray object.
        msg: Message to log (optional)

    Returns:
        DataFrame containing the standardized data

    Raises:
        OSError: If ``path`` is not an existing file
        ValueError: If required columns are missing (raised by
            ``standardize_dataframe``)
    """
    # Log message
    if msg:
        getLogger(__name__).info(msg)

    # Check if file exists
    if not path.is_file():
        raise OSError(f"{path} does not exist.")

    # Check if there's a units row (in which case we need to skip it).
    # ``readline`` returns "" at end of file, so an empty or header-only file does
    # not raise StopIteration here; pandas reports such files itself below.
    # The encoding matches the pandas default; undecodable bytes are replaced
    # because only the start of the second line is inspected.
    with open(path, encoding="utf-8", errors="replace") as f:
        f.readline()  # Skip header row
        first_data_row = f.readline().strip()
    # NOTE(review): any first data row beginning with "Unit" is treated as a units
    # row - confirm no genuine first-column value can start with that text.
    skiprows = [1] if first_data_row.startswith("Unit") else None

    # Read the file
    data = pd.read_csv(
        path,
        float_precision=float_precision,
        low_memory=False,
        skiprows=skiprows,
    )

    # Standardize the DataFrame
    return standardize_dataframe(
        data,
        required_columns=required_columns,
        exclude_extra_columns=exclude_extra_columns,
    )
def check_commodities(
    data: xr.Dataset | xr.DataArray, fill_missing: bool = True, fill_value: float = 0
) -> xr.Dataset | xr.DataArray:
    """Checks the commodities in ``data`` against the global commodities.

    Raises a ValueError if ``data`` contains a commodity that is not among the
    global commodities. If ``fill_missing`` is set, ``data`` is reindexed onto the
    global commodities, using ``fill_value`` for the ones it did not contain.
    """
    from muse.commodities import COMMODITIES

    known_commodities = COMMODITIES.commodity.values

    # Commodities present in the data but unknown globally are an error
    extra_commodities = []
    for name in data.commodity.values:
        if name not in known_commodities:
            extra_commodities.append(name)
    if extra_commodities:
        raise ValueError(
            "The following commodities were not found in global commodities file: "
            f"{extra_commodities}"
        )

    if not fill_missing:
        return data

    # Global commodities absent from the data are added with fill_value
    return data.reindex(commodity=COMMODITIES.commodity.values, fill_value=fill_value)


def create_assets(data: xr.DataArray | xr.Dataset) -> xr.DataArray | xr.Dataset:
    """Turns the "technology" dimension of ``data`` into an "asset" dimension.

    The technology names are kept as a "technology" entry along "asset", and an
    "installed" entry is added holding the earliest year in the data for every
    asset.
    """
    technology_names = data.technology.values

    # Rename technology to asset, keeping the names along the new dimension
    result = data.drop_vars("technology").rename(technology="asset")
    result["technology"] = ("asset", technology_names)

    # Every asset is marked as installed in the first year of the data
    first_year = int(result.year.min())
    result["installed"] = ("asset", [first_year] * len(result.technology))
    return result
def read_technodictionary(path: Path) -> xr.Dataset:
    """Reads a technodictionary CSV file and converts it to an xarray Dataset."""
    return process_technodictionary(read_technodictionary_csv(path))
def read_technodictionary_csv(path: Path) -> pd.DataFrame:
    """Reads a technodata file into a standardized DataFrame.

    A warning is logged for each deprecated column found in the file.
    """
    required_columns = {
        "cap_exp",
        "region",
        "var_par",
        "fix_exp",
        "interest_rate",
        "utilization_factor",
        "minimum_service_factor",
        "year",
        "cap_par",
        "var_exp",
        "technology",
        "technical_life",
        "fix_par",
        "growth_seed",
    }
    data = read_csv(
        path,
        required_columns=required_columns,
        msg=f"Reading technodictionary from {path}.",
    )

    # Deprecated columns, with the extra sentence (if any) explaining what
    # replaces them
    deprecated_columns = {
        "fuel": "This information is now determined from CommIn files. ",
        "end_use": "This information is now determined from CommOut files. ",
        "scaling_size": "",
    }
    for column, detail in deprecated_columns.items():
        if column not in data.columns:
            continue
        getLogger(__name__).warning(
            f"The '{column}' column in {path} has been deprecated. "
            f"{detail}"
            "Please remove this column from your Technodata files."
        )

    return data


def process_technodictionary(data: pd.DataFrame) -> xr.Dataset:
    """Converts a technodictionary DataFrame into an xarray Dataset.

    The Dataset is indexed by technology, region and year. If a "type" variable is
    present, a "tech_type" variable is added from its first region and year.
    """
    dimensions = ["technology", "region", "year"]
    indexed = create_multiindex(
        data,
        index_columns=dimensions,
        index_names=dimensions,
        drop_columns=True,
    )
    result = create_xarray_dataset(indexed)

    # Handle tech_type if present
    if "type" in result.variables:
        result["tech_type"] = result.type.isel(region=0, year=0)

    return result
def read_technodata_timeslices(path: Path) -> xr.Dataset:
    """Reads a technodata timeslices CSV file and converts it to a Dataset."""
    return process_technodata_timeslices(read_technodata_timeslices_csv(path))
def read_technodata_timeslices_csv(path: Path) -> pd.DataFrame:
    """Reads a technodata timeslices file into a standardized DataFrame.

    Only the required columns (including one column per level of the global
    timeslice index) are kept.
    """
    from muse.timeslices import TIMESLICE

    timeslice_columns = set(TIMESLICE.coords["timeslice"].indexes["timeslice"].names)
    required_columns = {
        "utilization_factor",
        "technology",
        "minimum_service_factor",
        "region",
        "year",
    } | timeslice_columns

    data = read_csv(
        path,
        required_columns=required_columns,
        exclude_extra_columns=True,
        msg=f"Reading technodata timeslices from {path}.",
    )
    return data


def process_technodata_timeslices(data: pd.DataFrame) -> xr.Dataset:
    """Converts a technodata timeslices DataFrame into an xarray Dataset.

    Every column other than the factor columns becomes an index level. When all
    levels of the global timeslice index are present as dimensions, they are
    stacked into a single "timeslice" dimension before sorting.
    """
    from muse.timeslices import TIMESLICE, sort_timeslices

    # Everything that is not a value column is used as an index level
    value_columns = ["utilization_factor", "minimum_service_factor", "obj_sort"]
    index_columns = [name for name in data.columns if name not in value_columns]
    indexed = create_multiindex(
        data,
        index_columns=index_columns,
        index_names=index_columns,
        drop_columns=True,
    )

    result = create_xarray_dataset(indexed)

    # Stack timeslice levels (month, day, hour) into a single timeslice dimension
    timeslice_levels = TIMESLICE.coords["timeslice"].indexes["timeslice"].names
    levels_present = all(level in result.dims for level in timeslice_levels)
    if levels_present:
        result = result.stack(timeslice=timeslice_levels)
    return sort_timeslices(result)
def read_io_technodata(path: Path) -> xr.Dataset:
    """Reads an input/output technodata CSV file and converts it to a Dataset."""
    return process_io_technodata(read_io_technodata_csv(path))
def read_io_technodata_csv(path: Path) -> pd.DataFrame:
    """Reads process inputs or outputs into a DataFrame.

    The returned DataFrame always has a "level" column; unspecified levels are set
    to "fixed".
    """
    data = read_csv(
        path,
        required_columns=["technology", "region", "year"],
        msg=f"Reading IO technodata from {path}.",
    )

    # Unspecified Level values default to "fixed"
    if "level" in data.columns:
        # read_csv has already cast "level" to str (it is listed in COLUMN_TYPES),
        # which can turn missing entries into the literal string "nan". Treat both
        # real missing values and that string as unspecified; ``fillna`` alone would
        # miss the latter.
        level = data["level"]
        data["level"] = level.mask(level.isna() | (level == "nan"), "fixed")
    else:
        # Particularly relevant to outputs files where the Level column is omitted by
        # default, as only "fixed" outputs are allowed.
        data["level"] = "fixed"

    return data


def process_io_technodata(data: pd.DataFrame) -> xr.Dataset:
    """Processes IO technodata DataFrame into an xarray Dataset.

    The result has "fixed" and "flexible" data variables indexed by technology,
    region, year and commodity, reindexed onto the global commodities.

    Raises:
        ValueError: If the data defines neither "fixed" nor "flexible" levels
    """
    from muse.commodities import COMMODITIES

    # Extract commodity columns
    commodities = [c for c in data.columns if c in COMMODITIES.commodity.values]

    # Convert commodity columns to long format (i.e. single "commodity" column)
    data = data.melt(
        id_vars=["technology", "region", "year", "level"],
        value_vars=commodities,
        var_name="commodity",
        value_name="value",
    )

    # Pivot data to create fixed and flexible columns
    data = data.pivot(
        index=["technology", "region", "year", "commodity"],
        columns="level",
        values="value",
    )

    # Create xarray dataset
    result = create_xarray_dataset(data)

    # Ensure both `fixed` and `flexible` inputs/outputs are defined. If only one is
    # defined in the input data, create the other as a zeros array with the same shape.
    has_fixed = "fixed" in result.data_vars
    has_flexible = "flexible" in result.data_vars
    if has_fixed and not has_flexible:
        result["flexible"] = xr.zeros_like(result.fixed).rename("flexible")
    elif has_flexible and not has_fixed:
        result["fixed"] = xr.zeros_like(result.flexible).rename("fixed")
    elif not has_fixed and not has_flexible:
        raise ValueError("Neither 'fixed' nor 'flexible' levels were found.")

    # Fill any NaNs with zero
    # NOTE(review): create_xarray_dataset is called above with its default
    # disallow_nan=True, which raises on any NaN, so these fills look unreachable.
    # Confirm whether sparse fixed/flexible definitions were meant to be allowed.
    result["fixed"] = result.fixed.fillna(0)
    result["flexible"] = result.flexible.fillna(0)

    # Check commodities
    result = check_commodities(result, fill_missing=True, fill_value=0)
    return result
def read_technologies(
    technodata_path: Path,
    comm_out_path: Path,
    comm_in_path: Path,
    time_framework: list[int],
    interpolation_mode: str = "linear",
    technodata_timeslices_path: Path | None = None,
) -> xr.Dataset:
    """Reads technology data from several CSV files and combines them.

    The individual files are read first, then handed to ``process_technologies``
    together with the time framework and interpolation mode.

    Args:
        technodata_path: path to the technodata file
        comm_out_path: path to the comm_out file
        comm_in_path: path to the comm_in file
        time_framework: list of years to interpolate data to
        interpolation_mode: Interpolation mode to use
        technodata_timeslices_path: path to the technodata_timeslices file
            (optional)

    Returns:
        xr.Dataset: Dataset containing the processed technology data, as returned
        by ``process_technologies``.
    """
    technodata = read_technodictionary(technodata_path)
    comm_out = read_io_technodata(comm_out_path)
    comm_in = read_io_technodata(comm_in_path)

    # The timeslice-level file is optional
    if technodata_timeslices_path:
        technodata_timeslices = read_technodata_timeslices(technodata_timeslices_path)
    else:
        technodata_timeslices = None

    return process_technologies(
        technodata=technodata,
        comm_out=comm_out,
        comm_in=comm_in,
        time_framework=time_framework,
        interpolation_mode=interpolation_mode,
        technodata_timeslices=technodata_timeslices,
    )
def process_technologies(
    technodata: xr.Dataset,
    comm_out: xr.Dataset,
    comm_in: xr.Dataset,
    time_framework: list[int],
    interpolation_mode: str = "linear",
    technodata_timeslices: xr.Dataset | None = None,
) -> xr.Dataset:
    """Combines technodata, inputs, outputs and timeslice data into one Dataset.

    All datasets are interpolated onto the union of ``time_framework`` and the
    years found in the data before being merged. Raises a ValueError if any
    flexible output is non-zero.
    """
    from muse.commodities import COMMODITIES, CommodityUsage
    from muse.timeslices import drop_timeslice
    from muse.utilities import interpolate_technodata

    # Give inputs and outputs distinct variable names
    inputs = comm_in.rename(flexible="flexible_inputs", fixed="fixed_inputs")
    outputs = comm_out.rename(flexible="flexible_outputs", fixed="fixed_outputs")

    # Legacy: flexible outputs must all be zero, and are then discarded
    flexible_outputs_all_zero = (outputs["flexible_outputs"] == 0).all()
    if not flexible_outputs_all_zero:
        raise ValueError(
            "'flexible' outputs are not permitted. All outputs must be 'fixed'"
        )
    outputs = outputs.drop_vars("flexible_outputs")

    # Years to interpolate to: the time framework plus every year in the data files
    time_framework = list(
        set(time_framework).union(
            technodata.year.values.tolist(),
            inputs.year.values.tolist(),
            outputs.year.values.tolist(),
            technodata_timeslices.year.values.tolist()
            if technodata_timeslices
            else [],
        )
    )

    # Interpolate everything onto those years
    technodata = interpolate_technodata(technodata, time_framework, interpolation_mode)
    outputs = interpolate_technodata(outputs, time_framework, interpolation_mode)
    inputs = interpolate_technodata(inputs, time_framework, interpolation_mode)
    if technodata_timeslices:
        technodata_timeslices = interpolate_technodata(
            technodata_timeslices, time_framework, interpolation_mode
        )

    # Outputs are merged first, then inputs
    technodata = technodata.merge(outputs, join="outer")
    technodata = technodata.merge(inputs, join="outer")

    # Values from technodata_timeslices take priority; the non-timesliced
    # technodata is the fallback wherever technodata_timeslices has no value.
    if technodata_timeslices:
        utilization = technodata_timeslices.utilization_factor.combine_first(
            technodata.utilization_factor
        )
        technodata["utilization_factor"] = utilization

        minimum_service = technodata_timeslices.minimum_service_factor.combine_first(
            technodata.minimum_service_factor
        )
        technodata["minimum_service_factor"] = drop_timeslice(minimum_service)

    # Commodities must be known globally (no filling here)
    technodata = check_commodities(technodata, fill_missing=False)

    # Attach the global commodity information for the commodities in use
    commodity_info = COMMODITIES.sel(commodity=technodata.commodity)
    technodata = technodata.merge(commodity_info, join="outer")

    # Add commodity usage flags, then drop the commodity type
    usage_flags = CommodityUsage.from_technologies(technodata).values
    technodata["comm_usage"] = ("commodity", usage_flags)
    technodata = technodata.drop_vars("commodity_type")

    # Check utilization and minimum service factors
    check_utilization_and_minimum_service_factors(technodata)

    return technodata
def read_initial_capacity(path: Path) -> xr.DataArray:
    """Reads an initial capacity CSV file and converts it to a DataArray."""
    return process_initial_capacity(read_initial_capacity_csv(path))
def read_initial_capacity_csv(path: Path) -> pd.DataFrame:
    """Reads an initial capacity file into a standardized DataFrame."""
    data = read_csv(
        path,
        required_columns={"region", "technology"},
        msg=f"Reading initial capacity from {path}.",
    )
    return data


def process_initial_capacity(data: pd.DataFrame) -> xr.DataArray:
    """Converts an initial capacity DataFrame into an xarray DataArray of assets.

    Columns whose name consists of digits only are treated as years and melted
    into a single "year" column before the data is indexed by technology, region
    and year and converted into assets.
    """
    # The unit column (if any) is not needed
    data = data.drop(columns=["unit"], errors="ignore")

    # Year columns are the ones named with digits only
    year_columns = [name for name in data.columns if name.isdigit()]

    # Long format: one row per technology, region and year
    long_data = data.melt(
        id_vars=["technology", "region"],
        value_vars=year_columns,
        var_name="year",
        value_name="value",
    )

    dimensions = ["technology", "region", "year"]
    indexed = create_multiindex(
        long_data,
        index_columns=dimensions,
        index_names=dimensions,
        drop_columns=True,
    )

    capacity = create_xarray_dataset(indexed).value.astype(float)
    return create_assets(capacity)
def read_global_commodities(path: Path) -> xr.Dataset:
    """Reads a global commodities CSV file and converts it to a Dataset."""
    return process_global_commodities(read_global_commodities_csv(path))
def read_global_commodities_csv(path: Path) -> pd.DataFrame:
    """Reads commodities information from input into a DataFrame.

    A warning is logged if the file has no "unit" column.

    Raises:
        ValueError: If the "commodity" or "commodity_type" column is missing
    """
    # Due to legacy reasons, users can supply both Commodity and CommodityName columns
    # In this case, we need to remove the Commodity column to avoid conflicts
    # This is fine because Commodity just contains a long description that isn't needed
    getLogger(__name__).info(f"Reading global commodities from {path}.")
    df = pd.read_csv(path)
    df = df.rename(columns=camel_to_snake)
    if "commodity" in df.columns and "commodity_name" in df.columns:
        df = df.drop(columns=["commodity"])

    required_columns = {
        "commodity",
        "commodity_type",
    }
    data = standardize_dataframe(
        df,
        required_columns=required_columns,
    )

    # Raise warning if units are not defined
    if "unit" not in data.columns:
        msg = (
            "No units defined for commodities. Please define units for all commodities "
            "in the global commodities file."
        )
        getLogger(__name__).warning(msg)

    return data


def process_global_commodities(data: pd.DataFrame) -> xr.Dataset:
    """Processes global commodities DataFrame into an xarray Dataset.

    The result is indexed by "commodity". The input DataFrame is not modified.
    """
    # Drop description column if present. It's useful to include in the file, but we
    # don't need it for the simulation.
    # ``drop`` always returns a new DataFrame, so the index assignment below never
    # touches the caller's object (previously it did when there was no
    # "description" column).
    data = data.drop(columns=["description"], errors="ignore")

    data.index = list(data.commodity)
    data = data.drop("commodity", axis=1)
    data.index.name = "commodity"
    return create_xarray_dataset(data)
def read_agent_parameters(path: Path) -> list[dict]:
    """Reads and processes agent parameters from a CSV file.

    Args:
        path: Path to the agents CSV file

    Returns:
        One dictionary per agent (row of the file), as built by
        ``process_agent_parameters``.
    """
    df = read_agent_parameters_csv(path)
    return process_agent_parameters(df)
def read_agent_parameters_csv(path: Path) -> pd.DataFrame:
    """Reads a MUSE agent-declaration csv file into a standardized DataFrame.

    Logs a warning if retrofit agents are present, drops the legacy
    "agent_number" column, and raises a ValueError if the numbers of objective,
    obj_data and obj_sort columns differ.
    """
    required_columns = {
        "search_rule",
        "quantity",
        "region",
        "type",
        "name",
        "agent_share",
        "decision_method",
    }
    data = read_csv(
        path,
        required_columns=required_columns,
        msg=f"Reading agent parameters from {path}.",
    )

    # Check for deprecated retrofit agents
    if "type" in data.columns:
        is_retrofit = data.type.str.lower().isin(["retrofit", "retro"])
        if is_retrofit.any():
            getLogger(__name__).warning(
                "Retrofit agents will be deprecated in a future release. "
                "Please modify your model to use only agents of the 'New' type."
            )

    # Legacy: drop AgentNumber column
    data = data.drop(columns=["agent_number"], errors="ignore")

    # There must be equally many objective, obj_data and obj_sort columns
    column_counts = {
        prefix: sum(name.startswith(prefix) for name in data.columns)
        for prefix in ("objective", "obj_data", "obj_sort")
    }
    if len(set(column_counts.values())) != 1:
        raise ValueError(
            "Agent objective, obj_data, and obj_sort columns are inconsistent in "
            f"{path}"
        )

    return data


def process_agent_parameters(data: pd.DataFrame) -> list[dict]:
    """Converts an agent parameters DataFrame into a list of agent dictionaries.

    One dictionary is produced per row. The optional "maturity_threshold" and
    "spend_limit" entries are only included when the row provides them.
    """
    agent_type_names = {
        "new": "newcapa",
        "newcapa": "newcapa",
        "retrofit": "retrofit",
        "retro": "retrofit",
        "agent": "agent",
        "default": "agent",
    }
    optional_keys = ("maturity_threshold", "spend_limit")

    def non_null_values(row, prefix):
        # Values of all entries whose label starts with prefix, skipping nulls
        selected = [label.startswith(prefix) for label in row.index]
        return row[selected].dropna().to_list()

    agents = []
    for _, row in data.iterrows():
        # Objectives and their decision parameters
        objectives = non_null_values(row, "objective")
        sorting = non_null_values(row, "obj_sort")
        floats = non_null_values(row, "obj_data")
        decision_params = list(zip(objectives, sorting, floats))

        agent_type = agent_type_names[getattr(row, "type", "agent").lower()]

        agent = {
            "name": row["name"],
            "region": row.region,
            "objectives": objectives,
            "search_rules": row.search_rule,
            "decision": {"name": row.decision_method, "parameters": decision_params},
            "agent_type": agent_type,
            "quantity": row.quantity,
            "share": row.agent_share,
        }

        # Optional parameters
        for key in optional_keys:
            if hasattr(row, key):
                agent[key] = getattr(row, key)

        agents.append(agent)

    return agents
def read_initial_market(
    projections_path: Path,
    base_year_import_path: Path | None = None,
    base_year_export_path: Path | None = None,
    currency: str | None = None,
) -> xr.Dataset:
    """Reads the initial market files and assembles them into a Dataset.

    Args:
        projections_path: path to the projections file
        base_year_import_path: path to the base year import file (optional)
        base_year_export_path: path to the base year export file (optional)
        currency: currency string (e.g. "USD")

    Returns:
        xr.Dataset: Dataset containing initial market data.
    """
    projections_df = read_projections_csv(projections_path)

    # Exports and imports are optional; a missing path yields None
    export_df = (
        read_csv(
            base_year_export_path,
            msg=f"Reading base year export from {base_year_export_path}.",
        )
        if base_year_export_path
        else None
    )
    import_df = (
        read_csv(
            base_year_import_path,
            msg=f"Reading base year import from {base_year_import_path}.",
        )
        if base_year_import_path
        else None
    )

    return process_initial_market(projections_df, import_df, export_df, currency)
def read_projections_csv(path: Path) -> pd.DataFrame:
    """Reads a projections file into a standardized DataFrame."""
    return read_csv(
        path,
        required_columns={"region", "attribute", "year"},
        msg=f"Reading projections from {path}.",
    )


def process_initial_market(
    projections_df: pd.DataFrame,
    import_df: pd.DataFrame | None,
    export_df: pd.DataFrame | None,
    currency: str | None = None,
) -> xr.Dataset:
    """Builds the initial market Dataset from projections and trade data.

    The result holds "prices", "exports", "imports" and "static_trade" (imports
    minus exports). Missing trade data is replaced by zeros shaped like the
    projections.

    Args:
        projections_df: DataFrame containing projections data
        import_df: Optional DataFrame containing import data
        export_df: Optional DataFrame containing export data
        currency: Currency string (e.g. "USD")
    """
    from muse.commodities import COMMODITIES
    from muse.timeslices import broadcast_timeslice, distribute_timeslice

    projections = process_attribute_table(projections_df).commodity_price.astype(
        "float64"
    )

    # Optional trade data falls back to zeros with the shape of the projections
    base_year_export = (
        xr.zeros_like(projections)
        if export_df is None
        else process_attribute_table(export_df).exports.astype("float64")
    )
    base_year_import = (
        xr.zeros_like(projections)
        if import_df is None
        else process_attribute_table(import_df).imports.astype("float64")
    )

    # Prices are broadcast over timeslices, trade quantities are distributed
    projections = broadcast_timeslice(projections, level=None)
    base_year_export = distribute_timeslice(base_year_export, level=None)
    base_year_import = distribute_timeslice(base_year_import, level=None)

    market = {
        "prices": projections,
        "exports": base_year_export,
        "imports": base_year_import,
        "static_trade": base_year_import - base_year_export,
    }
    result = check_commodities(xr.Dataset(market), fill_missing=True, fill_value=0)

    # The units_prices coordinate needs both a currency and commodity units
    if not (currency and "unit" in COMMODITIES.data_vars):
        return result

    units_prices = []
    for c in result.commodity.values:
        units_prices.append(f"{currency}/{COMMODITIES.sel(commodity=c).unit.item()}")
    return result.assign_coords(units_prices=("commodity", units_prices))
def read_attribute_table(path: Path) -> xr.Dataset:
    """Reads an attribute table CSV file and converts it to a Dataset."""
    return process_attribute_table(read_attribute_table_csv(path))
def read_attribute_table_csv(path: Path) -> pd.DataFrame:
    """Reads a MUSE attribute table (e.g. price projections) into a DataFrame."""
    return read_csv(
        path,
        required_columns=["region", "attribute", "year"],
        msg=f"Reading attribute table from {path}.",
    )


def process_attribute_table(data: pd.DataFrame) -> xr.Dataset:
    """Converts an attribute table DataFrame into an xarray Dataset.

    Every column other than region, year and attribute is treated as a commodity.
    The result has one data variable per attribute value, indexed by region, year
    and commodity.
    """
    key_columns = ["region", "year", "attribute"]

    # All remaining columns hold commodity values
    commodity_columns = [name for name in data.columns if name not in key_columns]

    # Long format: a single "commodity" column
    long_data = data.melt(
        id_vars=key_columns,
        value_vars=commodity_columns,
        var_name="commodity",
        value_name="value",
    )

    # One column per attribute
    by_attribute = long_data.pivot(
        index=["region", "year", "commodity"],
        columns="attribute",
        values="value",
    )

    return create_xarray_dataset(by_attribute)
def read_presets(presets_paths: Path) -> xr.DataArray:
    """Reads and processes preset data from multiple CSV files.

    Accepts a path pattern for presets files, e.g. `Path("path/to/*Consumption.csv")`.
    The file name of each file must contain a year (e.g. "2020Consumption.csv").

    Args:
        presets_paths: Glob pattern matching the presets files

    Returns:
        The combined presets, as returned by ``process_presets``.

    Raises:
        OSError: If no file matches the pattern, a file name contains no
            four-digit year, or the same year is found in more than one file.
    """
    from glob import glob
    from re import match

    # Find all files matching the path pattern
    allfiles = [Path(p) for p in glob(str(presets_paths))]
    if not allfiles:
        raise OSError(f"No files found with paths {presets_paths}")

    # Read all files
    datas: dict[int, pd.DataFrame] = {}
    for path in allfiles:
        # Extract year from filename. The character before the year is optional
        # (".?") so that names starting with the year, such as
        # "2020Consumption.csv", are accepted; names that matched the previous
        # pattern still yield the same year.
        reyear = match(r"\S*.?(\d{4})\S*\.csv", path.name)
        if reyear is None:
            raise OSError(f"Unexpected filename {path.name}")
        year = int(reyear.group(1))
        if year in datas:
            raise OSError(f"Year {year} was found twice")

        # Read data
        data = read_presets_csv(path)
        data["year"] = year
        datas[year] = data

    # Process data
    return process_presets(datas)
def read_presets_csv(path: Path) -> pd.DataFrame:
    """Load a single presets CSV file into a DataFrame.

    The "region" and "timeslice" columns are passed to ``read_csv`` as required
    columns. If a legacy "technology" column is present, a warning is logged,
    the column is dropped and the remaining data is summed per region and
    timeslice.

    Args:
        path: Location of the presets CSV file.

    Returns:
        The presets DataFrame, without a "technology" column.
    """
    data = read_csv(
        path,
        required_columns=["region", "timeslice"],
        msg=f"Reading presets from {path}.",
    )

    if "technology" not in data.columns:
        return data

    # Legacy: drop technology column and sum data (PR #448)
    getLogger(__name__).warning(
        f"The technology (or ProcessName) column in file {path} is "
        "deprecated. Data has been summed across technologies, and this column "
        "has been dropped."
    )
    without_technology = data.drop(columns=["technology"])
    summed = without_technology.groupby(["region", "timeslice"]).sum()
    return summed.reset_index()


def process_presets(datas: dict[int, pd.DataFrame]) -> xr.Dataset:
    """Combine per-year presets DataFrames into a single xarray object.

    Args:
        datas: Presets DataFrames keyed by year. Each frame is expected to hold
            "region", "year" and "timeslice" columns plus commodity columns.

    Returns:
        The "value" variable of the combined dataset, cast to float, with
        timeslice coordinates taken from ``TIMESLICE`` and passed through
        ``check_commodities`` with missing commodities filled with 0.
    """
    # NOTE(review): the annotation says xr.Dataset, but the returned object is
    # derived from the ``.value`` variable of the dataset - confirm whether
    # xr.DataArray is the intended annotation.
    from muse.commodities import COMMODITIES
    from muse.timeslices import TIMESLICE

    # Stack the frames of all years on top of each other
    combined = pd.concat(datas.values())

    # Only columns named after a known commodity carry data
    known_commodities = COMMODITIES.commodity.values
    commodity_columns = [
        name for name in combined.columns if name in known_commodities
    ]

    # Wide -> long: collapse the commodity columns into a single "commodity" column
    long_form = combined.melt(
        id_vars=["region", "year", "timeslice"],
        value_vars=commodity_columns,
        var_name="commodity",
        value_name="value",
    )

    # Index by region, year, timeslice and commodity
    dimensions = ["region", "year", "timeslice", "commodity"]
    indexed = create_multiindex(
        long_form,
        index_columns=["region", "year", "timeslice", "commodity"],
        index_names=dimensions,
        drop_columns=True,
    )

    presets = create_xarray_dataset(indexed).value.astype(float)
    presets = presets.assign_coords(timeslice=TIMESLICE.timeslice)
    return check_commodities(presets, fill_missing=True, fill_value=0)
def read_trade_technodata(path: Path) -> xr.Dataset:
    """Load a trade technodata CSV file and convert it to an xarray Dataset.

    Args:
        path: Location of the trade technodata CSV file.

    Returns:
        The result of ``process_trade_technodata`` applied to the DataFrame
        produced by ``read_trade_technodata_csv``.
    """
    return process_trade_technodata(read_trade_technodata_csv(path))
def read_trade_technodata_csv(path: Path) -> pd.DataFrame:
    """Load a trade technodata CSV file into a DataFrame.

    The "technology", "region" and "parameter" columns are passed to
    ``read_csv`` as required columns.

    Args:
        path: Location of the trade technodata CSV file.

    Returns:
        The DataFrame returned by ``read_csv``.
    """
    return read_csv(
        path,
        required_columns={"technology", "region", "parameter"},
        msg=f"Reading trade technodata from {path}.",
    )


def process_trade_technodata(data: pd.DataFrame) -> xr.Dataset:
    """Reshape a trade technodata DataFrame into an xarray Dataset.

    After dropping an optional "unit" column, every column other than
    "technology", "region" and "parameter" is treated as a destination region.

    Args:
        data: Trade technodata with one column per destination region.

    Returns:
        The result of ``create_xarray_dataset`` applied to a frame indexed by
        technology, region and dst_region, with one column per parameter.
    """
    id_columns = ["technology", "region", "parameter"]

    # The unit column carries no data, so it is discarded when present
    table = data.drop(columns=["unit"]) if "unit" in data.columns else data

    # TODO: this is a bit unsafe as user could supply other columns
    destination_columns = [name for name in table.columns if name not in id_columns]

    # Wide -> long: one row per (technology, region, parameter, dst_region)
    long_form = table.melt(
        id_vars=id_columns,
        value_vars=destination_columns,
        var_name="dst_region",
        value_name="value",
    )

    # Long -> wide again, this time with one column per parameter
    by_parameter = long_form.pivot(
        index=["technology", "region", "dst_region"],
        columns="parameter",
        values="value",
    )

    return create_xarray_dataset(by_parameter)
def read_existing_trade(path: Path) -> xr.DataArray:
    """Load an existing trade CSV file and convert it to an xarray DataArray.

    Args:
        path: Location of the existing trade CSV file.

    Returns:
        The result of ``process_existing_trade`` applied to the DataFrame
        produced by ``read_existing_trade_csv``.
    """
    return process_existing_trade(read_existing_trade_csv(path))
def read_existing_trade_csv(path: Path) -> pd.DataFrame:
    """Load an existing trade CSV file into a DataFrame.

    The "region", "technology" and "year" columns are passed to ``read_csv`` as
    required columns.

    Args:
        path: Location of the existing trade CSV file.

    Returns:
        The DataFrame returned by ``read_csv``.
    """
    return read_csv(
        path,
        required_columns={"region", "technology", "year"},
        msg=f"Reading existing trade from {path}.",
    )


def process_existing_trade(data: pd.DataFrame) -> xr.DataArray:
    """Reshape an existing trade DataFrame into an xarray DataArray.

    Every column other than "technology", "region" and "year" is treated as a
    destination region.

    Args:
        data: Existing trade data with one column per destination region.

    Returns:
        The float "value" variable indexed by region, dst_region, technology
        and year, after being passed through ``create_assets``.
    """
    id_columns = ["technology", "region", "year"]

    # TODO: this is a bit unsafe as user could supply other columns
    destination_columns = [name for name in data.columns if name not in id_columns]

    # Wide -> long: one row per (technology, region, year, dst_region)
    long_form = data.melt(
        id_vars=id_columns,
        value_vars=destination_columns,
        var_name="dst_region",
        value_name="value",
    )

    # Index by region, dst_region, technology and year
    dimensions = ["region", "dst_region", "technology", "year"]
    indexed = create_multiindex(
        long_form,
        index_columns=["region", "dst_region", "technology", "year"],
        index_names=dimensions,
        drop_columns=True,
    )

    trade = create_xarray_dataset(indexed).value.astype(float)

    # Create assets from technologies
    return create_assets(trade)
def read_timeslice_shares(path: Path) -> xr.DataArray:
    """Load a timeslice shares CSV file and convert it to an xarray DataArray.

    Args:
        path: Location of the timeslice shares CSV file.

    Returns:
        The result of ``process_timeslice_shares`` applied to the DataFrame
        produced by ``read_timeslice_shares_csv``.
    """
    return process_timeslice_shares(read_timeslice_shares_csv(path))
def read_timeslice_shares_csv(path: Path) -> pd.DataFrame:
    """Load a timeslice shares CSV file into a DataFrame.

    The "region" and "timeslice" columns are passed to ``read_csv`` as required
    columns.

    Args:
        path: Location of the timeslice shares CSV file.

    Returns:
        The DataFrame returned by ``read_csv``.
    """
    return read_csv(
        path,
        required_columns=["region", "timeslice"],
        msg=f"Reading timeslice shares from {path}.",
    )


def process_timeslice_shares(data: pd.DataFrame) -> xr.DataArray:
    """Reshape a timeslice shares DataFrame into an xarray DataArray.

    Args:
        data: Timeslice shares with "region" and "timeslice" columns plus one
            column per commodity.

    Returns:
        The float "value" variable indexed by region, timeslice and commodity,
        with timeslice coordinates taken from ``TIMESLICE`` and passed through
        ``check_commodities`` with missing commodities filled with 0.
    """
    from muse.commodities import COMMODITIES
    from muse.timeslices import TIMESLICE

    # Only columns named after a known commodity carry data
    known_commodities = COMMODITIES.commodity.values
    commodity_columns = [name for name in data.columns if name in known_commodities]

    # Wide -> long: collapse the commodity columns into a single "commodity" column
    long_form = data.melt(
        id_vars=["region", "timeslice"],
        value_vars=commodity_columns,
        var_name="commodity",
        value_name="value",
    )

    # Index by region, timeslice and commodity
    dimensions = ["region", "timeslice", "commodity"]
    indexed = create_multiindex(
        long_form,
        index_columns=["region", "timeslice", "commodity"],
        index_names=dimensions,
        drop_columns=True,
    )

    shares = create_xarray_dataset(indexed).value.astype(float)
    shares = shares.assign_coords(timeslice=TIMESLICE.timeslice)
    return check_commodities(shares, fill_missing=True, fill_value=0)
def read_macro_drivers(path: Path) -> xr.Dataset:
    """Reads and processes macro drivers data from a CSV file.

    Args:
        path: Location of the macro drivers CSV file.

    Returns:
        The xarray Dataset produced by ``process_macro_drivers`` from the
        DataFrame read by ``read_macro_drivers_csv``.
    """
    # The return annotation matches ``process_macro_drivers``, which is declared
    # to return an xr.Dataset (the previous annotation said pd.DataFrame).
    df = read_macro_drivers_csv(path)
    return process_macro_drivers(df)
def read_macro_drivers_csv(path: Path) -> pd.DataFrame:
    """Load a macro drivers CSV file into a DataFrame.

    The "region" and "variable" columns are passed to ``read_csv`` as required
    columns, and the "variable" column must contain both "Population" and
    "GDP|PPP".

    Args:
        path: Location of the macro drivers CSV file.

    Returns:
        The DataFrame returned by ``read_csv``.

    Raises:
        ValueError: If "Population" or "GDP|PPP" is absent from the "variable"
            column.
    """
    table = read_csv(
        path,
        required_columns=["region", "variable"],
        msg=f"Reading macro drivers from {path}.",
    )

    # Validate required variables
    present_variables = table.variable.unique()
    missing_variables = [
        name for name in ("Population", "GDP|PPP") if name not in present_variables
    ]
    if missing_variables:
        raise ValueError(f"Missing required variables in {path}: {missing_variables}")

    return table


def process_macro_drivers(data: pd.DataFrame) -> xr.Dataset:
    """Reshape a macro drivers DataFrame into an xarray Dataset.

    After dropping an optional "unit" column, every column whose name consists
    of digits only is treated as a year column.

    Args:
        data: Macro drivers with "variable" and "region" columns plus one
            column per year.

    Returns:
        The result of ``create_xarray_dataset`` applied to a frame indexed by
        region and year, with one column per variable ("Population" renamed to
        "population" and "GDP|PPP" renamed to "gdp").
    """
    # The unit column carries no data, so it is discarded when present
    table = data.drop(columns=["unit"]) if "unit" in data.columns else data

    year_columns = [name for name in table.columns if name.isdigit()]

    # Wide -> long: collapse the year columns into a single "year" column
    long_form = table.melt(
        id_vars=["variable", "region"],
        value_vars=year_columns,
        var_name="year",
        value_name="value",
    )

    # Long -> wide again, this time with one column per variable
    by_variable = long_form.pivot(
        index=["region", "year"],
        columns="variable",
        values="value",
    )

    # Legacy names; labels absent from the columns are left untouched by rename
    by_variable = by_variable.rename(
        columns={"Population": "population", "GDP|PPP": "gdp"}
    )

    return create_xarray_dataset(by_variable)
def read_regression_parameters(path: Path) -> xr.Dataset:
    """Load a regression parameters CSV file and convert it to an xarray Dataset.

    Args:
        path: Location of the regression parameters CSV file.

    Returns:
        The result of ``process_regression_parameters`` applied to the
        DataFrame produced by ``read_regression_parameters_csv``.
    """
    return process_regression_parameters(read_regression_parameters_csv(path))
def read_regression_parameters_csv(path: Path) -> pd.DataFrame:
    """Load a regression parameters CSV file into a DataFrame.

    The "region", "function_type" and "coeff" columns are passed to
    ``read_csv`` as required columns. A warning is logged when a "sector"
    column is present.

    Args:
        path: Location of the regression parameters CSV file.

    Returns:
        The DataFrame returned by ``read_csv``.
    """
    table = read_csv(
        path,
        required_columns=["region", "function_type", "coeff"],
        msg=f"Reading regression parameters from {path}.",
    )

    # Legacy: warn about "sector" column
    has_sector_column = "sector" in table.columns
    if has_sector_column:
        getLogger(__name__).warning(
            f"The sector column (in file {path}) is deprecated. Please remove."
        )

    return table


def process_regression_parameters(data: pd.DataFrame) -> xr.Dataset:
    """Reshape a regression parameters DataFrame into an xarray Dataset.

    Args:
        data: Regression parameters with "sector", "region", "function_type"
            and "coeff" columns plus one column per commodity.

    Returns:
        A dataset with one variable per coefficient over sector, region and
        commodity, plus a "function_type" variable over sector, passed through
        ``check_commodities`` with missing commodities filled with 0.
    """
    # NOTE(review): read_regression_parameters_csv reports the "sector" column
    # as deprecated and does not require it, yet it is used as an id column
    # below - confirm how files without a "sector" column should be handled.
    from muse.commodities import COMMODITIES

    # Only columns named after a known commodity carry data
    known_commodities = COMMODITIES.commodity.values
    commodity_columns = [name for name in data.columns if name in known_commodities]

    # Wide -> long: collapse the commodity columns into a single "commodity" column
    long_form = data.melt(
        id_vars=["sector", "region", "function_type", "coeff"],
        value_vars=commodity_columns,
        var_name="commodity",
        value_name="value",
    )

    # Lookup from sector to its function type (first occurrence of each pair)
    sector_pairs = long_form[["sector", "function_type"]].drop_duplicates()
    function_types = sector_pairs.set_index("sector")["function_type"]

    # One column per coefficient
    by_coefficient = long_form.pivot_table(
        index=["sector", "region", "commodity"], columns="coeff", values="value"
    )

    result = create_xarray_dataset(by_coefficient)
    result["function_type"] = xr.DataArray(
        function_types[result.sector.values].astype(object),
        dims=["sector"],
        name="function_type",
    )

    return check_commodities(result, fill_missing=True, fill_value=0)


def check_utilization_and_minimum_service_factors(data: xr.Dataset) -> None:
    """Validate utilization and minimum service factors in an xarray dataset.

    Args:
        data: xarray Dataset containing utilization_factor and
            minimum_service_factor

    Raises:
        ValueError: If utilization_factor is not a data variable, if it is zero
            (summed over the timeslice dimension when that dimension exists),
            if either factor has values outside the range 0 to 1 inclusive, or
            if a utilization factor is below its minimum service factor.
    """
    if "utilization_factor" not in data.data_vars:
        raise ValueError(
            "A technology needs to have a utilization factor defined for every "
            "timeslice."
        )

    utilization = data.utilization_factor

    # With a timeslice dimension the check applies to the total over timeslices
    has_timeslices = "timeslice" in data.dims
    total = utilization.sum(dim="timeslice") if has_timeslices else utilization
    if (total == 0).any():
        raise ValueError(
            "A technology can not have a utilization factor of 0 for every timeslice."
        )

    utilization_in_range = (utilization >= 0) & (utilization <= 1)
    if not utilization_in_range.all():
        raise ValueError(
            "Utilization factor values must all be between 0 and 1 inclusive."
        )

    # Read only after the utilization checks, so those errors are raised first
    minimum_service = data.minimum_service_factor
    minimum_service_in_range = (minimum_service >= 0) & (minimum_service <= 1)
    if not minimum_service_in_range.all():
        raise ValueError(
            "Minimum service factor values must all be between 0 and 1 inclusive."
        )

    if (utilization < minimum_service).any():
        raise ValueError(
            "Utilization factors must all be greater than or equal "
            "to their corresponding minimum service factors."
        )