Module arvados.http_to_keep

Functions

def http_to_keep(api,
project_uuid,
url,
utcnow=<built-in method utcnow of type object>,
varying_url_params='',
prefer_cached_downloads=False)
Expand source code
def http_to_keep(api, project_uuid, url,
                 utcnow=datetime.datetime.utcnow, varying_url_params="",
                 prefer_cached_downloads=False):
    """Download a file over HTTP and upload it to keep, with HTTP headers as metadata.

    Before downloading the URL, checks to see if the URL already
    exists in Keep and applies HTTP caching policy, the
    varying_url_params and prefer_cached_downloads flags in order to
    decide whether to use the version in Keep or re-download it.
    """

    logger.info("Checking Keep for %s", url)

    varying_params = [s.strip() for s in varying_url_params.split(",")]

    parsed = urllib.parse.urlparse(url)
    query = [q for q in urllib.parse.parse_qsl(parsed.query)
             if q[0] not in varying_params]

    clean_url = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params,
                                         urllib.parse.urlencode(query, safe="/"),  parsed.fragment))

    r1 = api.collections().list(filters=[["properties", "exists", url]]).execute()

    if clean_url == url:
        items = r1["items"]
    else:
        r2 = api.collections().list(filters=[["properties", "exists", clean_url]]).execute()
        items = r1["items"] + r2["items"]

    now = utcnow()

    etags = {}

    curldownloader = _Downloader(api)

    for item in items:
        properties = item["properties"]

        if clean_url in properties:
            cache_url = clean_url
        elif url in properties:
            cache_url = url
        else:
            raise Exception("Shouldn't happen, got an API result for %s that doesn't have the URL in properties" % item["uuid"])

        if prefer_cached_downloads or _fresh_cache(cache_url, properties, now):
            # HTTP caching rules say we should use the cache
            cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
            return (item["portable_data_hash"], next(iter(cr.keys())) )

        if not _changed(cache_url, clean_url, properties, now, curldownloader):
            # Etag didn't change, same content, just update headers
            api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
            cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
            return (item["portable_data_hash"], next(iter(cr.keys())))

        for etagstr in ("Etag", "ETag"):
            if etagstr in properties[cache_url] and len(properties[cache_url][etagstr]) > 2:
                etags[properties[cache_url][etagstr]] = item

    logger.debug("Found ETag values %s", etags)

    properties = {}
    headers = {}
    if etags:
        headers['If-None-Match'] = ', '.join([_etag_quote(k) for k,v in etags.items()])
    logger.debug("Sending GET request with headers %s", headers)

    logger.info("Beginning download of %s", url)

    req = curldownloader.download(url, headers)

    c = curldownloader.collection

    if req.status_code not in (200, 304):
        raise Exception("Failed to download '%s' got status %s " % (url, req.status_code))

    if curldownloader.target is not None:
        curldownloader.target.close()

    _remember_headers(clean_url, properties, req.headers, now)

    if req.status_code == 304 and "Etag" in req.headers and req.headers["Etag"] in etags:
        item = etags[req.headers["Etag"]]
        item["properties"].update(properties)
        api.collections().update(uuid=item["uuid"], body={"collection":{"properties": item["properties"]}}).execute()
        cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
        return (item["portable_data_hash"], list(cr.keys())[0])

    logger.info("Download complete")

    collectionname = "Downloaded from %s" % urllib.parse.quote(clean_url, safe='')

    # max length - space to add a timestamp used by ensure_unique_name
    max_name_len = 254 - 28

    if len(collectionname) > max_name_len:
        over = len(collectionname) - max_name_len
        split = int(max_name_len/2)
        collectionname = collectionname[0:split] + "…" + collectionname[split+over:]

    c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)

    api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()

    return (c.portable_data_hash(), curldownloader.name)

Download a file over HTTP and upload it to keep, with HTTP headers as metadata.

Before downloading the URL, checks to see if the URL already exists in Keep and applies HTTP caching policy, the varying_url_params and prefer_cached_downloads flags in order to decide whether to use the version in Keep or re-download it.