import os
import requests
from solidipes.downloaders.downloader import Downloader
from solidipes.scripts.init import create_solidipes_directory
from solidipes.utils import DataRepositoryException, set_study_metadata
from ..utils.dspace5_utils import check_response, download_files, get_host_and_id
class Dspace5Downloader(Downloader):
    """Downloader for studies hosted on a DSpace 5 repository.

    ``command`` lists the names ``dspace5`` and ``ethresearchcollection``;
    presumably these are the CLI sub-command names under which the base
    ``Downloader`` registers this class (confirm against ``Downloader``).
    """

    command = ["dspace5", "ethresearchcollection"]
    command_help = "Download study from Dspace5"

    def download(self, args):
        """Run the download by delegating to the module-level ``main``.

        Args:
            args: Parsed arguments as configured by ``populate_arg_parser``
                (``identifier``, ``destination``, ``only_metadata``).
        """
        main(args)

    def populate_arg_parser(self, parser):
        """Declare the command-line arguments of this downloader.

        Args:
            parser: Argument parser to configure. Its description is set to
                ``command_help`` and three arguments are added: the
                positional ``identifier``, the optional positional
                ``destination`` (defaults to ``""``) and the
                ``--only-metadata`` flag.
        """
        parser.description = self.command_help

        parser.add_argument("identifier", help="URL or DOI of the study to download")

        parser.add_argument(
            "destination",
            nargs="?",
            default="",
            help="Path to the destination folder. If not specified, defaults to the study numeric ID.",
        )

        parser.add_argument(
            "--only-metadata",
            help="Only download metadata (overrides destination directory's metadata!)",
            action="store_true",
        )
def main(args):
    """Download content from Dspace5.

    Retrieves the metadata of an item through the repository's REST API,
    stores it as study metadata in the destination directory and, unless
    ``args.only_metadata`` is set, downloads the item's bitstreams there.

    Args:
        args: Parsed arguments exposing ``identifier`` (URL or DOI of the
            study), ``destination`` (target directory; when empty it is
            overwritten in place with the study ID) and ``only_metadata``
            (when true, stop after the metadata has been saved).

    Raises:
        Exception: Any error that is not a ``DataRepositoryException``
            propagates to the caller. A ``DataRepositoryException`` is
            printed and the function returns ``None``.
    """
    # Imported locally, as in the original code (possibly to avoid an import
    # cycle; not verifiable from this file alone).
    from solidipes.utils.metadata import dc_to_solidipes

    # Without an explicit timeout ``requests`` waits indefinitely on a host
    # that accepts the connection but never answers.
    request_timeout = 60  # seconds

    try:
        host, study_id = get_host_and_id(args.identifier)
        api_root = f"https://{host}/rest/api"

        # Scan record
        response = requests.get(f"{api_root}/items/{study_id}/metadata", timeout=request_timeout)
        check_response(response, 200, "retrieve record")

        # TODO beware repeatable fields? A key that occurs several times in
        # the response keeps only its last value here.
        record = {entry["key"]: entry["value"] for entry in response.json()}

        print(f"Retrieving study {study_id} from {host}...")

        # Create destination folder if it does not exist
        if not args.destination:
            args.destination = f"{study_id}"
        os.makedirs(args.destination, exist_ok=True)

        # Create Solidipes directory if it does not exist
        try:
            create_solidipes_directory(args.destination)
        except FileExistsError:
            # Already initialised by a previous run: nothing to do.
            pass

        # Save metadata in YAML file
        print("Saving metadata...")
        # NOTE(review): ``process_metadata`` is neither imported nor defined in
        # the visible part of this file - confirm that it is in scope.
        metadata = process_metadata(dc_to_solidipes(record))
        # Keep the raw record next to the converted metadata, tagged with
        # where it came from.
        metadata["zz_orig_metadata"] = record
        metadata["zz_orig_metadata"]["00solidipes_platform"] = "dspace5"
        metadata["zz_orig_metadata"]["00solidipes_host"] = host
        metadata["zz_orig_metadata"]["00solidipes_study_id"] = study_id
        set_study_metadata(metadata, initial_path=args.destination)

        if args.only_metadata:
            return

        response = requests.get(f"{api_root}/items/{study_id}/bitstreams", timeout=request_timeout)
        check_response(response, 200, "locate files")
        bitstreams = response.json()

        # patch download links, they are relative at least in the ETHZ Research Collection
        for bitstream in bitstreams:
            link = bitstream["retrieveLink"]
            if not link.startswith("http"):
                bitstream["retrieveLink"] = f"{api_root}/{link}"

        download_files(bitstreams, destination=args.destination, progressbar=True)

    except DataRepositoryException as e:
        # Repository-side problems are reported to the user instead of
        # producing a traceback; every other exception propagates unchanged.
        print(e)