"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license. """ import json import logging import re from collections import OrderedDict, defaultdict from typing import Tuple, Union from urllib.parse import urlencode import defusedxml.ElementTree as etree import jsonschema import numpy as np import pandas as pd import xmljson from django.conf import settings from label_studio_sdk._extensions.label_studio_tools.core import label_config from rest_framework.exceptions import ValidationError from label_studio.core.utils.io import find_file logger = logging.getLogger(__name__) _DATA_EXAMPLES = None _LABEL_TAGS = {'Label', 'Choice', 'Relation'} SINGLE_VALUED_TAGS = {'choices': str, 'rating': int, 'number': float, 'textarea': str} _NOT_CONTROL_TAGS = { 'Filter', } # TODO: move configs in right place _LABEL_CONFIG_SCHEMA = find_file('label_config_schema.json') with open(_LABEL_CONFIG_SCHEMA) as f: _LABEL_CONFIG_SCHEMA_DATA = json.load(f) def parse_config(config_string): """ :param config_string: Label config string :return: structured config of the form: { ".name": { "type": "ControlTag", "to_name": [".name", ".name"], "inputs: [ {"type": "ObjectTag1", "value": ".value"}, {"type": "ObjectTag2", "value": ".value"} ], "labels": ["Label1", "Label2", "Label3"] // taken from "alias" if exists or "value" } """ logger.warning('Using deprecated method - switch to label_studio.tools.label_config.parse_config!') return label_config.parse_config(config_string) def _fix_choices(config): """ workaround for single choice https://github.com/HumanSignal/label-studio/issues/1259 """ if 'Choices' in config: # for single Choices tag in View if 'Choice' in config['Choices'] and not isinstance(config['Choices']['Choice'], list): config['Choices']['Choice'] = [config['Choices']['Choice']] # for several Choices tags in View elif isinstance(config['Choices'], list) and all('Choice' in tag_choices for tag_choices in config['Choices']): for n in range(len(config['Choices'])): # check that Choices tag has only 1 choice if not isinstance(config['Choices'][n]['Choice'], list): config['Choices'][n]['Choice'] = [config['Choices'][n]['Choice']] if 'View' in config: if isinstance(config['View'], OrderedDict): config['View'] = _fix_choices(config['View']) else: config['View'] = [_fix_choices(view) for view in config['View']] return config def parse_config_to_xml(config_string: Union[str, None], raise_on_empty: bool = False) -> Union[OrderedDict, None]: if config_string is None: if raise_on_empty: raise TypeError('config_string is None') return None xml = etree.fromstring(config_string, forbid_dtd=True) # Remove comments for comment in xml.findall('.//comment'): comment.getparent().remove(comment) return xml def parse_config_to_json(config_string: Union[str, None]) -> Tuple[Union[OrderedDict, None], Union[str, None]]: try: xml = parse_config_to_xml(config_string, raise_on_empty=True) except TypeError: raise etree.ParseError('can only parse strings') if xml is None: raise etree.ParseError('xml is empty or incorrect') config = xmljson.badgerfish.data(xml) config = _fix_choices(config) return config, etree.tostring(xml, encoding='unicode') def validate_label_config(config_string: Union[str, None]) -> None: # xml and schema try: config, cleaned_config_string = parse_config_to_json(config_string) jsonschema.validate(config, _LABEL_CONFIG_SCHEMA_DATA) except (etree.ParseError, ValueError) as exc: raise ValidationError(str(exc)) except jsonschema.exceptions.ValidationError as exc: # jsonschema4 validation error now includes all errors from "anyOf" subschemas # check https://python-jsonschema.readthedocs.io/en/latest/errors/#jsonschema.exceptions.ValidationError.context # we pick the first failed schema and show only its error message error_message = exc.context[0].message if len(exc.context) else exc.message error_message = 'Validation failed on {}: {}'.format( '/'.join(map(str, exc.path)), error_message.replace('@', '') ) raise ValidationError(error_message) # unique names in config # FIXME: 'name =' (with spaces) won't work all_names = re.findall(r'name="([^"]*)"', cleaned_config_string) if len(set(all_names)) != len(all_names): raise ValidationError('Label config contains non-unique names') # toName points to existent name names = set(all_names) toNames = re.findall(r'toName="([^"]*)"', cleaned_config_string) for toName_ in toNames: for toName in toName_.split(','): if toName not in names: raise ValidationError(f'toName="{toName}" not found in names: {sorted(names)}') def extract_data_types(label_config): # load config xml = parse_config_to_xml(label_config) if xml is None: raise etree.ParseError('Project config is empty or incorrect') # take all tags with values attribute and fit them to tag types data_type = {} parent = xml.findall('.//*[@value]') for match in parent: if not match.get('name'): continue name = match.get('value') # simple one if len(name) > 1 and (name[0] == '$'): name = name[1:] # video has highest priority, e.g. # for