Source code for gpm.utils.dataframe
# -----------------------------------------------------------------------------.
# MIT License
# Copyright (c) 2024 GPM-API developers
#
# This file is part of GPM-API.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# -----------------------------------------------------------------------------.
"""This module contains general utility to convert xarray objects to dataframes."""
import dask
from gpm.dataset.granule import remove_unused_var_dims
from gpm.utils.xarray import ensure_unique_chunking
[docs]
def get_df_object_columns(df):
    """Return the names of the dataframe columns with 'object' dtype.

    Parameters
    ----------
    df : dataframe
        Dataframe exposing ``select_dtypes`` (within this module it is called
        with both pandas and dask dataframes).

    Returns
    -------
    list
        Names of the columns selected by ``select_dtypes(include="object")``,
        in dataframe column order.
    """
    # Restrict the dataframe to its 'object' columns, then list their names
    object_subset = df.select_dtypes(include="object")
    object_names = object_subset.columns
    return list(object_names)
[docs]
def ensure_pyarrow_string_columns(df):
    """Cast every 'object' dtype column to the ``string[pyarrow]`` dtype.

    Parameters
    ----------
    df : dataframe
        Dataframe whose 'object' columns must be converted.

    Returns
    -------
    dataframe
        The same dataframe object that was passed in: the converted columns
        are assigned back onto it, not onto a copy.
    """
    target_dtype = "string[pyarrow]"
    object_names = get_df_object_columns(df)
    for name in object_names:
        # Replace the column with its pyarrow-backed string version
        converted = df[name].astype(target_dtype)
        df[name] = converted
    return df
[docs]
def drop_undesired_columns(df):
    """Remove the undesired columns (e.g. dataset dimensions without coordinates).

    Only the candidate names that are actually present in the dataframe are
    dropped, so missing candidates never raise an error.

    Parameters
    ----------
    df : dataframe
        Dataframe to clean up.

    Returns
    -------
    dataframe
        Result of ``df.drop`` without the undesired columns.
    """
    candidates = ("cross_track", "along_track", "range", "beam", "pixel", "crsWGS84")
    existing = df.columns
    # Keep only the candidates that exist in this dataframe
    to_drop = [name for name in candidates if name in existing]
    return df.drop(columns=to_drop)
[docs]
def to_pandas_dataframe(ds):
    """Convert an xarray.Dataset to a :py:class:`pandas.DataFrame`.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset to convert.

    Returns
    -------
    pandas.DataFrame
        Dataframe with a reset (non-MultiIndex) index, 'object' columns cast
        to pyarrow strings and the undesired columns removed.
    """
    # Discard the coordinates that are not relevant
    cleaned_ds = remove_unused_var_dims(ds)

    # Build the pandas dataframe
    # - Note: string variables end up as 'object' columns !
    raw_df = cleaned_ds.to_dataframe(dim_order=None)

    # Cast the 'object' columns to pyarrow strings
    typed_df = ensure_pyarrow_string_columns(raw_df)

    # Discard the MultiIndex
    flat_df = typed_df.reset_index(drop=True)

    # Remove the columns that are not required (former dataset dimensions)
    return drop_undesired_columns(flat_df)
[docs]
def to_dask_dataframe(ds):
    """Convert an xarray.Dataset to a :py:class:`dask.dataframe.DataFrame`.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset to convert.

    Returns
    -------
    dask.dataframe.DataFrame
        Dataframe with 'object' columns cast to pyarrow strings and the
        undesired columns removed.
    """
    # Discard the coordinates that are not relevant
    cleaned_ds = remove_unused_var_dims(ds)

    # Make sure the dataset chunking is uniform
    chunked_ds = ensure_unique_chunking(cleaned_ds)

    # Build the dask dataframe
    # - Note: string variables end up as 'object' columns !
    # - The dask config override only applies while the dataframe is built
    config_override = {"array.slicing.split_large_chunks": True}
    with dask.config.set(**config_override):
        raw_df = chunked_ds.to_dask_dataframe(dim_order=None, set_index=False)

    # Cast the 'object' columns to pyarrow strings
    typed_df = ensure_pyarrow_string_columns(raw_df)

    # Remove the columns that are not required (former dataset dimensions)
    return drop_undesired_columns(typed_df)