Source code for gpm.io.ges_disc

# -----------------------------------------------------------------------------.
# MIT License

# Copyright (c) 2024 GPM-API developers
#
# This file is part of GPM-API.

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# -----------------------------------------------------------------------------.
"""Routines required to search data on the NASA GES DISC servers."""

import datetime
import re
import shlex
import subprocess

from gpm.io.products import get_product_info, is_trmm_product

###---------------------------------------------------------------------------.
###########################
#### GES DISC scraping ####
###########################


def _get_ges_disc_url_content(url):
    """Download the content served at a GES DISC url using ``curl``.

    Parameters
    ----------
    url : str
        Url to query.

    Returns
    -------
    str
        The decoded standard output of ``curl -L <url>``.

    Raises
    ------
    ValueError
        If ``curl`` produces no output, or if the output contains the
        "The requested URL was not found on this server" message.

    """
    # Build the argument vector explicitly instead of formatting a command
    # string and splitting it again: the url is then always passed to curl as
    # one single argument, whatever characters it contains.
    # ``-L`` asks curl to follow redirects.
    list_cmd = ["curl", "-L", str(url)]
    process = subprocess.Popen(list_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for curl to terminate
    stdout = process.communicate()[0].decode()
    # Check if server is available and if the url exists
    not_found_marker = "The requested URL was not found on this server"
    if stdout == "" or not_found_marker in stdout:
        raise ValueError(f"The requested url {url} was not found on the GES DISC server.")
    return stdout


def _get_href_value(input_string):
    """Infer href value."""
    match = re.search(r'<a\s+href="([^"]+)"', input_string)
    # Check if a match was found and extract the value
    href_value = match.group(1) if match else ""
    # Exclude .xml files and doc directory
    if ".xml" in href_value or "doc/" in href_value:
        href_value = ""
    return href_value


def _get_ges_disc_list_path(url):
    """List the urls of the entries found in a GES DISC directory page.

    Parameters
    ----------
    url : str
        Url of the GES DISC directory.

    Returns
    -------
    list of str
        One ``<url>/<entry>`` string per entry retained from the page.

    Raises
    ------
    ValueError
        If the url content can not be retrieved (raised by
        ``_get_ges_disc_url_content``) or if no entry is retained.

    """
    # Retrieve url content
    # - If it returns something, means url is correct
    page = _get_ges_disc_url_content(url)
    # The page is cut at every 'alt=' occurrence and the first four fragments
    # are skipped (presumably the listing header entries - TODO confirm)
    entries = []
    for fragment in page.split("alt=")[4:]:
        href = _get_href_value(fragment)
        if href != "":
            entries.append(href)
    if not entries:
        raise ValueError(f"The GES DISC {url} directory is empty.")
    return [f"{url}/{entry}" for entry in entries]


####--------------------------------------------------------------------------.
#####################
#### Directories ####
#####################


def _get_ges_disc_server(product):
    """Return the base data url of the GES DISC server hosting a product.

    Parameters
    ----------
    product : str
        Product name, as accepted by ``is_trmm_product``.

    Returns
    -------
    str
        The ``disc2`` server url for TRMM products, the ``gpm2`` server url
        for every other product.

    """
    # TRMM
    if is_trmm_product(product):
        return "https://disc2.gesdisc.eosdis.nasa.gov/data"
    # GPM
    # - Alternative host kept for reference: https://gpm1.gesdisc.eosdis.nasa.gov/data
    return "https://gpm2.gesdisc.eosdis.nasa.gov/data"


def _get_ges_disc_product_folder_name(product, version):
    """Return the name of the GES DISC folder of a product version.

    The name is the ``ges_disc_dir`` entry of the product info followed by
    ``.0<version>``.
    """
    product_info = get_product_info(product)
    ges_disc_dir = product_info["ges_disc_dir"]
    return f"{ges_disc_dir}.0{version}"


def get_ges_disc_product_directory_tree(product, date, version):
    """Return the GES DISC product directory tree.

    The directory tree structure is

    - ``<product directory>/YYYY/DOY`` for L1 and L2 products (and IMERG half hourly)
    - ``<product directory>/YYYY/MM`` for L3 daily products
    - ``<product directory>/YYYY`` or ``<product directory>/YYYY/MM`` for L3 monthly products

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    date : datetime.date
        Single date for which to retrieve the data.
        ``datetime.datetime`` objects are accepted too.
    version : int
        GPM version of the data to retrieve.

    Returns
    -------
    directory_tree : str
        Directory tree on the NASA GES DISC server where the data are stored.

    """
    # Retrieve foldername
    folder_name = _get_ges_disc_product_folder_name(product, version)
    # Specify the directory tree
    # --> TODO: currently specified only for L1 and L2
    # Call strftime on the object itself: the unbound datetime.datetime.strftime
    # raises a TypeError when it receives a plain datetime.date
    year_doy = date.strftime("%Y/%j")
    return "/".join([folder_name, year_doy])
def get_ges_disc_product_directory(product, date, version):
    """Retrieve the NASA GES DISC server product directory path at a specific date.

    The returned url is the concatenation of the server base url and of the
    product directory tree.

    Parameters
    ----------
    product : str
        GPM product name. See ``gpm.available_products()``.
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve.

    Returns
    -------
    url_data_list : str
        url of the NASA GES DISC server where the data are stored.

    """
    # Server hosting the product
    server_url = _get_ges_disc_server(product)
    # Directory tree of the product at the requested date
    directory_tree = get_ges_disc_product_directory_tree(product=product, date=date, version=version)
    # Product directory where data are listed
    return f"{server_url}/{directory_tree}"
####--------------------------------------------------------------------------. ############################ #### Filepath Retrieval #### ############################ def _get_ges_disc_file_list(url_product_dir, product, date, version, verbose=True): """Retrieve NASA GES DISC filepaths for a specific day and product. The query is done using https ! The function does return the full GES DISC url file paths. The returned file paths refers to a single product !!! Parameters ---------- url_product_dir : str The GES DISC product directory url. product : str GPM product acronym. See ``gpm.available_products()``. date : datetime Single date for which to retrieve the data. verbose : bool, optional Default is ``False``. Whether to specify when data are not available for a specific date. """ try: filepaths = _get_ges_disc_list_path(url_product_dir) except Exception as e: # If url not exist, raise an error if "was not found on the GES DISC server" in str(e): raise e # If no filepath (empty directory), print message if verbose=True if verbose: version_str = str(int(version)) msg = f"No data found on GES DISC on date {date} for product {product} (V0{version_str})" print(msg) filepaths = [] return filepaths def _check_gesc_disc_product_type(product, product_type): if product_type == "NRT" and "IMERG" not in product: raise ValueError("The only available NRT products on GES DISC are IMERG-ER and IMERG-FR")
def get_ges_disc_daily_filepaths(product, product_type, date, version, verbose=True):
    """Retrieve the NASA GES DISC file paths available at a given date.

    Parameters
    ----------
    product : str
        GPM product acronym. See ``gpm.available_products()``.
    product_type : str
        GPM product type. It is only validated against the product
        (see ``_check_gesc_disc_product_type``); it does not enter the
        definition of the GES DISC directory url.
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve.
    verbose : bool, optional
        Whether to specify when data are not available for a specific date.
        The default is ``True``.

    Returns
    -------
    list of str
        The GES DISC file urls found for the requested date.
        See ``_get_ges_disc_file_list`` for the behaviour on failures.

    """
    # Validate the product / product_type combination
    _check_gesc_disc_product_type(product=product, product_type=product_type)
    # Url of the NASA GES DISC directory where the product files are listed
    product_dir_url = get_ges_disc_product_directory(product=product, date=date, version=version)
    # Retrieve GES DISC filepaths
    # - If empty: return []
    filepaths = _get_ges_disc_file_list(
        url_product_dir=product_dir_url,
        product=product,
        date=date,
        version=version,
        verbose=verbose,
    )
    return filepaths
def define_ges_disc_filepath(product, product_type, date, version, filename):
    """Define GES DISC filepath from filename.

    This function is called by ``get_filepath_from_filename(filename, storage, product_type)``.

    Parameters
    ----------
    product : str
        GPM product acronym. See ``gpm.available_products()``.
    product_type : str
        GPM product type. It is only validated against the product
        (see ``_check_gesc_disc_product_type``); it does not enter the
        definition of the url.
    date : datetime.date
        Single date for which to retrieve the data.
    version : int
        GPM version of the data to retrieve if ``product_type = "RS"``.
    filename : str
        Name of the GPM file.

    Returns
    -------
    str
        The product directory url followed by ``/<filename>``.

    """
    # Validate the product / product_type combination
    _check_gesc_disc_product_type(product=product, product_type=product_type)
    # Url of the product directory at the requested date
    product_dir_url = get_ges_disc_product_directory(product=product, date=date, version=version)
    # GES DISC filepath
    return f"{product_dir_url}/{filename}"