Module arvados.http_to_keep
Functions
def http_to_keep(api,
project_uuid,
url,
utcnow=datetime.datetime.utcnow,
varying_url_params='',
prefer_cached_downloads=False)
Expand source code
def http_to_keep(api, project_uuid, url,
                 utcnow=datetime.datetime.utcnow, varying_url_params="",
                 prefer_cached_downloads=False):
    """Download a file over HTTP and upload it to keep, with HTTP headers as metadata.

    Before downloading the URL, checks to see if the URL already
    exists in Keep and applies HTTP caching policy, the
    varying_url_params and prefer_cached_downloads flags in order to
    decide whether to use the version in Keep or re-download it.

    :param api: Arvados API client used for collection list/update calls.
    :param project_uuid: Owner project for a newly created collection.
    :param url: The URL to fetch.
    :param utcnow: Injectable clock (callable returning a datetime), for testing.
    :param varying_url_params: Comma-separated query parameter names to strip
        from the URL before using it as a cache key (e.g. signed-URL tokens).
    :param prefer_cached_downloads: If true, use any cached copy without
        consulting HTTP freshness rules.
    :returns: Tuple of (portable data hash, file name within the collection).
    :raises Exception: If the HTTP download fails, or an API result
        unexpectedly lacks the URL in its properties.
    """

    logger.info("Checking Keep for %s", url)

    varying_params = [s.strip() for s in varying_url_params.split(",")]

    # Build a "clean" cache-key URL with the varying query params removed.
    parsed = urllib.parse.urlparse(url)
    query = [q for q in urllib.parse.parse_qsl(parsed.query)
             if q[0] not in varying_params]

    clean_url = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params,
                                         urllib.parse.urlencode(query, safe="/"), parsed.fragment))

    # Look for existing collections keyed on either the original or the
    # cleaned URL in their properties.
    r1 = api.collections().list(filters=[["properties", "exists", url]]).execute()

    if clean_url == url:
        items = r1["items"]
    else:
        r2 = api.collections().list(filters=[["properties", "exists", clean_url]]).execute()
        items = r1["items"] + r2["items"]

    now = utcnow()

    etags = {}

    curldownloader = _Downloader(api)

    for item in items:
        properties = item["properties"]

        if clean_url in properties:
            cache_url = clean_url
        elif url in properties:
            cache_url = url
        else:
            raise Exception("Shouldn't happen, got an API result for %s that doesn't have the URL in properties" % item["uuid"])

        if prefer_cached_downloads or _fresh_cache(cache_url, properties, now):
            # HTTP caching rules say we should use the cache
            cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
            return (item["portable_data_hash"], next(iter(cr.keys())))

        if not _changed(cache_url, clean_url, properties, now, curldownloader):
            # Etag didn't change, same content, just update headers
            api.collections().update(uuid=item["uuid"], body={"collection":{"properties": properties}}).execute()
            cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
            return (item["portable_data_hash"], next(iter(cr.keys())))

        # Collect candidate ETags so we can issue a conditional GET below.
        # len > 2 skips degenerate values like empty quoted strings.
        for etagstr in ("Etag", "ETag"):
            if etagstr in properties[cache_url] and len(properties[cache_url][etagstr]) > 2:
                etags[properties[cache_url][etagstr]] = item

    logger.debug("Found ETag values %s", etags)

    properties = {}
    headers = {}
    if etags:
        headers['If-None-Match'] = ', '.join([_etag_quote(k) for k in etags])
    logger.debug("Sending GET request with headers %s", headers)

    logger.info("Beginning download of %s", url)

    req = curldownloader.download(url, headers)

    c = curldownloader.collection

    if req.status_code not in (200, 304):
        raise Exception("Failed to download '%s' got status %s " % (url, req.status_code))

    if curldownloader.target is not None:
        curldownloader.target.close()

    _remember_headers(clean_url, properties, req.headers, now)

    # 304 Not Modified with a matching ETag: reuse the cached collection,
    # just refresh its stored headers.
    if req.status_code == 304 and "Etag" in req.headers and req.headers["Etag"] in etags:
        item = etags[req.headers["Etag"]]
        item["properties"].update(properties)
        api.collections().update(uuid=item["uuid"], body={"collection":{"properties": item["properties"]}}).execute()
        cr = arvados.collection.CollectionReader(item["portable_data_hash"], api_client=api)
        return (item["portable_data_hash"], next(iter(cr.keys())))

    logger.info("Download complete")

    collectionname = "Downloaded from %s" % urllib.parse.quote(clean_url, safe='')

    # max length - space to add a timestamp used by ensure_unique_name
    max_name_len = 254 - 28

    if len(collectionname) > max_name_len:
        # Keep the head and tail of the name, eliding the middle.
        over = len(collectionname) - max_name_len
        split = int(max_name_len/2)
        collectionname = collectionname[0:split] + "…" + collectionname[split+over:]

    c.save_new(name=collectionname, owner_uuid=project_uuid, ensure_unique_name=True)

    api.collections().update(uuid=c.manifest_locator(), body={"collection":{"properties": properties}}).execute()

    return (c.portable_data_hash(), curldownloader.name)
Before downloading the URL, checks to see if the URL already exists in Keep and applies HTTP caching policy, the varying_url_params and prefer_cached_downloads flags in order to decide whether to use the version in Keep or re-download it.