Source code for solidipes.utils.metadata

"""This module must be lazy loaded due to pandas import."""

import os
from typing import Any, Dict, List

import pandas as pd
from iso639 import Lang

################################################################
# data_licenses

# Load the license table stored next to this module and expose two lists of
# (lower-cased id, title) pairs: every license, and the subset flagged for
# data or software use.
dir_name = os.path.dirname(__file__)
licenses = pd.read_csv(os.path.join(dir_name, "licenses.csv"))

# The filtered subset has to be taken while `licenses` is still a DataFrame.
licences_data_or_software = licenses[licenses["domain_data"] | licenses["domain_software"]][["id", "title"]]
licences_data_or_software = [
    (license_id.lower(), license_title)
    for license_id, license_title in zip(licences_data_or_software["id"], licences_data_or_software["title"])
]

licenses = licenses[["id", "title"]]
licenses = [
    (license_id.lower(), license_title) for license_id, license_title in zip(licenses["id"], licenses["title"])
]

################################################################
# languages

# Load the ISO 639-2 language table stored next to this module and expose it as
# a list of (lower-cased ISO 639-2 code, English name) pairs. Only rows that
# have a non-empty ISO 639-1 code after stripping whitespace are kept.
dir_name = os.path.dirname(__file__)
lang = pd.read_csv(os.path.join(dir_name, "languages-iso-639-2.csv"))
lang["ISO 639-1 Code"] = lang["ISO 639-1 Code"].apply(lambda code: code.strip())
lang = lang[lang["ISO 639-1 Code"] != ""][["ISO 639-2 Code", "English name of Language", "ISO 639-1 Code"]]
lang = [
    (iso_639_2.lower(), english_name)
    for iso_639_2, english_name in zip(lang["ISO 639-2 Code"], lang["English name of Language"])
]

################################################################
# metadata mappings


def dc_to_solidipes(dc_metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Transform Dublin Core Qualified metadata to solidipes-compliant, solidipes-adjacent metadata.

    Field values may be plain strings (DSpace 5 style) or dicts holding the
    actual value under a ``"value"`` key (DSpace 7 style), either bare or
    wrapped in a list. The input mapping is stored as-is under the
    ``"zz_orig_metadata"`` key and is not modified by this function.

    Args:
        dc_metadata (dict): Input metadata in Dublin Core Qualified format

    Returns:
        dict: Transformed metadata in solidipes format
    """

    def get_single_element(x):
        """Return the first plain value of a string, a list, or a ``{"value": ...}`` dict."""
        if isinstance(x, str):
            return x
        if isinstance(x, list):
            # An empty list carries no value: treat it like an empty field
            # instead of failing on x[0].
            return get_single_element(x[0]) if x else ""
        return x["value"]

    def get_all_elements(y):
        """Return the plain value of every entry of ``y``."""
        return [get_single_element(x) for x in y]

    def as_list(x):
        """Wrap a bare string/dict in a list; copy any other iterable into a new list."""
        if isinstance(x, (str, dict)):
            return [x]
        return list(x)

    # In Dspace5, the fields are strings and such.
    # In Dspace7, they are dicts with the actual value stored in a field.
    # The title is used to detect the flavor; a missing or empty title falls
    # back to the plain-string flavor instead of raising.
    title_probe = dc_metadata.get("dc.title")
    if isinstance(title_probe, list):
        title_probe = title_probe[0] if title_probe else ""
    empty_default = [{"value": ""}] if isinstance(title_probe, dict) else [""]

    solidipes_metadata: Dict[str, Any] = {}
    solidipes_metadata["zz_orig_metadata"] = dc_metadata

    doi_element = dc_metadata.get("dc.identifier.doi", empty_default)
    solidipes_metadata["doi"] = get_single_element(doi_element)

    # Transform creators
    creator_elements = as_list(dc_metadata.get("dc.contributor.author", empty_default))
    solidipes_metadata["creators"] = [{"name": creator} for creator in get_all_elements(creator_elements)]

    # Title (only the first one is kept)
    title_elements = dc_metadata.get("dc.title", empty_default)
    solidipes_metadata["title"] = get_single_element(title_elements)

    # Publisher
    publisher_elements = dc_metadata.get("dc.publisher", empty_default)
    solidipes_metadata["publisher"] = get_single_element(publisher_elements)

    # Publication date (the issued date is copied verbatim; the key is left
    # out when the field is present but empty)
    date_element = dc_metadata.get("dc.date.issued", empty_default)
    if date_element:
        solidipes_metadata["publication_date"] = get_single_element(date_element)

    # Subjects (Keywords)
    # Build a fresh list: extending the list obtained from dc_metadata in place
    # would alter the caller's data (and the copy kept in zz_orig_metadata).
    subjects = as_list(dc_metadata.get("dc.subject", [])) + as_list(dc_metadata.get("dc.subject.ddc", []))
    solidipes_metadata["keywords"] = get_all_elements(subjects)

    # Contributors
    contributors = as_list(dc_metadata.get("dc.contributor", []))
    solidipes_metadata["contributors"] = [
        {"contributorType": "Other", "name": contributor} for contributor in get_all_elements(contributors)
    ]

    # Language
    language_elements = dc_metadata.get("dc.language.iso", empty_default)
    if language_elements == empty_default:
        solidipes_metadata["language"] = "eng"
    else:
        # NOTE(review): Lang() presumably raises on values it does not
        # recognise (e.g. locale-style codes) - confirm against iso639 docs.
        language = Lang(get_single_element(language_elements))
        solidipes_metadata["language"] = language.pt2b

    # Resource Type
    upload_type_element = dc_metadata.get("dc.type", empty_default)
    # Dublin Core will often contain non-standard values
    standard_dc_types = {
        "text": "text",
        "image": "image",
        "sound": "sound",
        "dataset": "dataset",
        "software": "software",
        "interactive": "dataset",
        "event": "event",
        "physical object": "physicalobject",
    }
    if upload_type_element == empty_default:
        solidipes_metadata["upload_type"] = "dataset"
    else:
        # Map non-standard types to a safe value
        dc_type = get_single_element(upload_type_element).lower()
        solidipes_metadata["upload_type"] = standard_dc_types.get(dc_type, "dataset")

    version_elements = dc_metadata.get("dc.description.version", empty_default)
    solidipes_metadata["version"] = get_single_element(version_elements)

    return solidipes_metadata
def solidipes_to_dspace7(metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Transform solidipes-compliant/solidipes-adjacent metadata to DSpace7 patch-ready metadata,
    i.e. a list of field insertion operations.

    One "add" operation is produced per value of each mapped field. Scalar
    fields that are absent from ``metadata`` still produce one operation with
    an empty string value; an empty list and absent creators produce none.

    Args:
        metadata (dict): solidipes metadata

    Returns:
        list: "add" operations (dicts with ``op``, ``path`` and ``value`` keys)
        in DSpace7 patch-ready Dublin Core format
    """
    solidipes_dc_types_map = {
        "text": "text",
        "image": "image",
        "sound": "sound",
        "dataset": "dataset",
        "software": "software",
        "event": "event",
        "physicalobject": "physical object",
    }

    empty_default = [""]

    def map_single_field(solidipes_field, dc_field, mymap=None):
        """Build one "add" operation per value stored under ``solidipes_field``."""
        solidipes_elements = metadata.get(solidipes_field, empty_default)
        if isinstance(solidipes_elements, str):
            solidipes_elements = [solidipes_elements]

        operations = []
        for idx, x in enumerate(solidipes_elements):
            # Values without an entry in the mapping are passed through unchanged.
            value = x if mymap is None else mymap.get(x, x)
            operations.append(
                {
                    "op": "add",
                    "path": f"/metadata/{dc_field}/{idx}",
                    "value": [{"value": value}],
                }
            )
        return operations

    d7_metadata: List[Dict[str, Any]] = []
    d7_metadata.extend(map_single_field("title", "dc.title"))
    d7_metadata.extend(map_single_field("description", "dc.description.abstract"))
    d7_metadata.extend(map_single_field("doi", "dc.identifier.doi"))
    d7_metadata.extend(map_single_field("language", "dc.language.iso"))
    d7_metadata.extend(map_single_field("publication_date", "dc.date.issued"))
    d7_metadata.extend(map_single_field("publisher", "dc.publisher"))
    d7_metadata.extend(map_single_field("keywords", "dc.subject"))

    # "type" is still honoured when present; otherwise fall back to
    # "upload_type", the key dc_to_solidipes stores the resource type under.
    type_field = "type" if "type" in metadata else "upload_type"
    d7_metadata.extend(map_single_field(type_field, "dc.type", solidipes_dc_types_map))

    # Transform creators
    # Absent creators yield no operation (an empty-string placeholder cannot
    # be indexed with "name").
    for idx, creator in enumerate(metadata.get("creators", [])):
        d7_metadata.append(
            {
                "op": "add",
                "path": f"/metadata/dc.contributor.author/{idx}",
                "value": [{"value": creator["name"]}],
            }
        )
        # FIXME? Creator affiliations are not exported: the candidate field
        # "dc.contributor.affiliation" is either the wrong DC property, or not
        # supported at all.

    return d7_metadata