import requests
import netrc
def get_s3_links(collection_concept_id, provider, bounding_box=None,
                 time_range=None, cycle=None, wildcard=None, edl_token=None):
    """
    Fetch S3 links from CMR API based on search criteria.

    Pages through the CMR granule search, passing the ``cmr-search-after``
    response header of each page back as a request header for the next one,
    and collects every related URL whose type is ``GET DATA VIA DIRECT ACCESS``
    and whose URL contains ``s3://``.

    :param collection_concept_id: The concept ID of the collection to search within.
    :param provider: The data provider for the collection.
    :param bounding_box: A list of coordinates [min_lon, min_lat, max_lon, max_lat] to filter by bounding box.
    :param time_range: A list of two datetime strings [start_time, end_time] to filter by temporal range.
    :param cycle: The cycle value to filter by.
    :param wildcard: A native_id wildcard pattern to filter granules.
    :param edl_token: The EDL token for authentication (optional).
    :return: A list of S3 links from the CMR API. If a request fails, the error
        is printed and the links collected so far are returned.
    """
    base_url = 'https://cmr.earthdata.nasa.gov'
    search_endpoint = '/search/granules.umm_json'

    # Set up query parameters
    params = {
        'collection_concept_id': collection_concept_id,
        'provider': provider,
        'page_size': 2000,
    }
    if bounding_box:
        params['bounding_box'] = ','.join(map(str, bounding_box))
    if time_range:
        params['temporal'] = ','.join(map(str, time_range))
    if cycle:
        params['cycle'] = cycle
    if wildcard:
        # The pattern option is sent together with native_id so the value is
        # matched as a wildcard pattern.
        params['options[native_id][pattern]'] = 'true'
        params['native_id'] = wildcard

    # The search-after header is only added once a previous page supplied one.
    headers = {}
    if edl_token:
        headers["Authorization"] = f"Bearer {edl_token}"

    s3_links = []
    try:
        while True:
            response = requests.get(base_url + search_endpoint, params=params, headers=headers)
            # Check for request errors
            response.raise_for_status()
            response_data = response.json()

            if 'items' not in response_data:
                break

            for item in response_data['items']:
                if 'umm' not in item or 'RelatedUrls' not in item['umm']:
                    continue
                for url_info in item['umm']['RelatedUrls']:
                    # .get() keeps an entry without 'Type' or 'URL' from
                    # aborting the whole search with a KeyError.
                    url = url_info.get('URL')
                    if (url_info.get('Type') == 'GET DATA VIA DIRECT ACCESS'
                            and url and 's3://' in url):
                        s3_links.append(url)

            # No search-after header on the response means there are no more pages.
            cmr_search_after = response.headers.get("cmr-search-after")
            if cmr_search_after is None:
                break
            headers['cmr-search-after'] = cmr_search_after
    except requests.exceptions.RequestException as e:
        # Best effort: report the failure and return what was gathered so far.
        print(f"Error: {e}")
    return s3_links
CMR search getting S3 Links
This tutorial shows how to retrieve a list of S3 links from a CMR granule search.
Prerequisites
Before proceeding, ensure you have the following:
Python installed on your system, and the requests library installed. If requests is missing, you can install it with `pip install requests`.
Define the get_s3_links function to fetch S3 links for granules that match the CMR search criteria.
Provide CMR Credentials
The CMR API may require authentication through Earthdata Login (EDL). The following function retrieves an existing EDL token, or creates a new one if you have none. It reads your EDL credentials from a .netrc file, so you will need to create one first. For more information on .netrc files, see https://everything.curl.dev/usingcurl/netrc
# GET TOKEN FROM CMR
def get_token():
    """
    Return an Earthdata Login (EDL) access token for the .netrc credentials.

    Reads the username and password for ``urs.earthdata.nasa.gov`` from the
    user's .netrc file, lists the tokens that already exist for that user and
    returns the first one. If the list is empty, a new token is requested and
    returned instead.

    :return: The value of the ``access_token`` field, or None if the response
        does not contain that field.
    :raises ValueError: If .netrc has no entry for ``urs.earthdata.nasa.gov``.
    :raises requests.exceptions.HTTPError: If a token request returns an HTTP
        error status (for example, because the credentials are rejected).
    """
    urs_root = 'urs.earthdata.nasa.gov'

    # authenticators() returns None when the host has no entry; check before
    # unpacking so the failure explains itself.
    credentials = netrc.netrc().authenticators(urs_root)
    if credentials is None:
        raise ValueError(f"No .netrc entry found for {urs_root}")
    username, _, password = credentials
    auth = (username, password)

    # Reuse an existing token when the account already has one.
    token_api = "https://{}/api/users/tokens".format(urs_root)
    response = requests.get(token_api, auth=auth)
    # Fail on HTTP errors here instead of indexing into an error payload below.
    response.raise_for_status()
    content = response.json()
    if content:
        return content[0].get('access_token')

    # No existing token: create one.
    create_token_api = "https://{}/api/users/token".format(urs_root)
    response = requests.post(create_token_api, auth=auth)
    response.raise_for_status()
    content = response.json()
    return content.get('access_token')
# Obtain the EDL token used to authenticate the CMR requests.
edl_token = get_token()
Search Collection Concept id
Search for a collection concept ID using the CMR API. We will use the collection's short name to find its concept ID.
# Look up the collection concept ID from the collection's short name.
collection_short_name = "SWOT_L2_LR_SSH_BASIC_1.0"
headers = {"Authorization": f"Bearer {edl_token}"}
cmr_collection_url = f"https://cmr.earthdata.nasa.gov/search/collections.json?short_name={collection_short_name}"
response = requests.get(headers=headers, url=cmr_collection_url)
# Stop on an HTTP error instead of trying to parse an error response.
response.raise_for_status()
collection_entries = response.json().get('feed', {}).get('entry', [])
if not collection_entries:
    # An empty result would otherwise surface as a bare IndexError.
    raise ValueError(f"No CMR collection found for short name {collection_short_name!r}")
# The first matching collection entry is used.
collection_concept_id = collection_entries[0].get('id')
Define Search Criteria
Define the search criteria to filter the granules. These can include the collection concept ID, provider, bounding box, time range, cycle, and wildcard (if needed). A wildcard is used to search for granules whose names match the wildcard pattern.
# Example usage:
provider = 'POCLOUD'
# [min_lon, min_lat, max_lon, max_lat]
bounding_box = [-90, -90, 90, 90]
# [start_time, end_time]
time_range = ["2023-01-01T00:00:00Z", "2023-12-30T23:59:59Z"]
cycle = "560"
# Pattern matched against the granule native_id.
wildcard = "*2023*"
Fetch S3 Links
Now, let’s call the get_s3_links function with the provided search criteria to fetch the S3 links from the CMR API.
# Run the granule search with the criteria defined above.
s3_links = get_s3_links(collection_concept_id, provider,
                        bounding_box=bounding_box, time_range=time_range,
                        wildcard=wildcard, edl_token=edl_token, cycle=cycle)

print(len(s3_links))
# NOTE(review): display is not defined in this file; presumably the
# IPython/Jupyter builtin, so this line needs a notebook environment.
display(s3_links)