Source code for solidipes_core_plugin.downloaders.dspace5

import os

import requests
from solidipes.downloaders.downloader import Downloader
from solidipes.scripts.init import create_solidipes_directory
from solidipes.utils import DataRepositoryException, set_study_metadata

from ..utils.dspace5_utils import check_response, download_files, get_host_and_id



[docs]
class Dspace5Downloader(Downloader):
    """Downloader for studies hosted on a DSpace 5 repository.

    The download itself is carried out by the module-level ``main`` function.
    """

    # Names under which this downloader is exposed (presumably the CLI
    # sub-command names -- confirm against the ``Downloader`` base class).
    command = ["dspace5", "ethresearchcollection"]
    # Short description; also used as the argument parser's description.
    command_help = "Download study from Dspace5"


[docs]
    def download(self, args):
        """Download the study described by ``args`` by delegating to ``main``.

        Args:
            args: Parsed arguments; forwarded unchanged to ``main``.
        """
        main(args)



[docs]
    def populate_arg_parser(self, parser):
        """Register this downloader's command-line arguments on ``parser``.

        Sets the parser description to ``self.command_help`` and declares, in
        order, the required ``identifier`` positional, the optional
        ``destination`` positional (empty string when omitted) and the
        ``--only-metadata`` flag.

        Args:
            parser: Argument parser to configure in place.
        """
        parser.description = self.command_help

        # Positional arguments, declared in the order they appear on the command line.
        positional_specs = (
            ("identifier", {"help": "URL or DOI of the study to download"}),
            (
                "destination",
                {
                    "nargs": "?",
                    "default": "",
                    "help": "Path to the destination folder. If not specified, defaults to the study numeric ID.",
                },
            ),
        )
        for arg_name, options in positional_specs:
            parser.add_argument(arg_name, **options)

        parser.add_argument(
            "--only-metadata",
            action="store_true",
            help="Only download metadata (overrides destination directory's metadata!)",
        )





[docs]
def main(args):
    """Download a study's metadata and, optionally, its files from a DSpace 5 host.

    Steps: resolve host and study ID from ``args.identifier``, fetch the item
    metadata over the REST API, create the destination and Solidipes
    directories, save the converted metadata, then (unless
    ``args.only_metadata`` is set) list the item's bitstreams and download
    them.

    Args:
        args: Parsed arguments providing ``identifier``, ``destination`` and
            ``only_metadata``. When ``args.destination`` is empty it is
            overwritten with the study ID.

    Returns:
        None. A ``DataRepositoryException`` is printed rather than raised;
        every other exception propagates to the caller.
    """

    from solidipes.utils.metadata import dc_to_solidipes

    # Seconds to wait on the server; without a timeout ``requests`` may block
    # indefinitely on an unresponsive host.
    request_timeout = 60

    try:
        host, study_id = get_host_and_id(args.identifier)
        api_root = f"https://{host}/rest/api"

        # Scan record
        response = requests.get(f"{api_root}/items/{study_id}/metadata", timeout=request_timeout)
        # NOTE(review): check_response presumably raises DataRepositoryException
        # on an unexpected status code -- confirm in dspace5_utils.
        check_response(response, 200, "retrieve record")

        # Flatten the list of {"key": ..., "value": ...} entries into one dict.
        # TODO beware repeatable fields? (a repeated key keeps only its last value)
        record = {entry["key"]: entry["value"] for entry in response.json()}

        print(f"Retrieving study {study_id} from {host}...")

        # Create destination folder if it does not exist
        if not args.destination:
            args.destination = f"{study_id}"
        os.makedirs(args.destination, exist_ok=True)

        # Create Solidipes directory if it does not exist
        try:
            create_solidipes_directory(args.destination)
        except FileExistsError:
            pass

        # Save metadata in YAML file
        print("Saving metadata...")

        metadata = process_metadata(dc_to_solidipes(record))
        # Keep the raw record next to the converted metadata, tagged with its origin.
        record["00solidipes_platform"] = "dspace5"
        record["00solidipes_host"] = host
        record["00solidipes_study_id"] = study_id
        metadata["zz_orig_metadata"] = record

        set_study_metadata(metadata, initial_path=args.destination)

        if args.only_metadata:
            return

        response = requests.get(f"{api_root}/items/{study_id}/bitstreams", timeout=request_timeout)
        check_response(response, 200, "locate files")
        bitstreams = response.json()

        # patch download links, they are relative at least in the ETHZ Research Collection
        for bitstream in bitstreams:
            link = bitstream["retrieveLink"]
            if not link.startswith("http"):
                bitstream["retrieveLink"] = f"{api_root}/{link}"

        download_files(bitstreams, destination=args.destination, progressbar=True)

    except DataRepositoryException as e:
        # Repository-level failures are reported to the user, not raised.
        # Anything else propagates with its original traceback.
        print(e)




[docs]
def process_metadata(metadata):
    """Process metadata to make dataset uploadable again.

    The dictionary is modified in place:

    - ``upload_type`` is filled from ``resource_type["type"]`` (which is then
      removed), or set to ``"dataset"`` when neither key is present.
    - A ``journal`` mapping is flattened into ``journal_title``,
      ``journal_volume``, ``journal_issue`` and ``journal_pages``.
    - ``license`` is replaced by its lower-cased ``id``, or dropped when the
      id is missing or empty.
    - Every ``isVersionOf`` entry is removed from ``related_identifiers``.

    Args:
        metadata: Metadata dictionary to normalise.

    Returns:
        The same ``metadata`` object, after modification.
    """

    # TODO ignoring this for the moment

    if "upload_type" not in metadata:
        if "resource_type" in metadata:
            metadata["upload_type"] = metadata["resource_type"]["type"]
            del metadata["resource_type"]
        else:
            metadata["upload_type"] = "dataset"

    if "journal" in metadata:
        journal = metadata["journal"]
        for field in ["title", "volume", "issue", "pages"]:
            if field in journal:
                metadata[f"journal_{field}"] = journal[field]
        del metadata["journal"]

    if "license" in metadata:
        license_type = metadata["license"].get("id")
        if license_type:
            metadata["license"] = license_type.lower()
        else:
            del metadata["license"]

    related_identifiers = metadata.get("related_identifiers", [])
    # Filter via slice assignment: calling remove() while iterating skips the
    # element after each removal, so consecutive "isVersionOf" entries would
    # survive. Slice assignment also keeps the list object held by ``metadata``.
    related_identifiers[:] = [
        related for related in related_identifiers if related.get("relation") != "isVersionOf"
    ]

    return metadata