"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license. """ import logging from functools import reduce from operator import getitem from urllib.parse import urlparse import ujson as json from core.label_config import replace_task_data_undefined_with_config_field from rest_framework.exceptions import ValidationError class SkipField(Exception): pass _DATA_TYPES = { 'Text': [str, int, float, list], 'Header': [str, int, float], 'HyperText': [str], 'Image': [str, list], 'Paragraphs': [list, str], 'Chat': [list, str], 'Table': [dict, list, str], 'TimeSeries': [dict, list, str], 'TimeSeriesChannel': [dict, list, str], 'List': [list, str], 'Choices': [str, list], 'PolygonLabels': [str, list], 'Labels': [str, list], 'BrushLabels': [str, list], 'EllipseLabels': [str, list], 'HyperTextLabels': [str, list], 'KeyPointLabels': [str, list], 'ParagraphLabels': [str, list], 'RectangleLabels': [str, list], 'TimeSeriesLabels': [str, list], 'Taxonomy': [str, list, type(None)], 'Ranker': [list, str], } logger = logging.getLogger(__name__) class TaskValidator: """Task Validator with project scheme configs validation. It is equal to TaskSerializer from django backend.""" def __init__(self, project, instance=None): self.project = project self.instance = instance self.annotation_count = 0 self.prediction_count = 0 @staticmethod def check_data(project, data): """Validate data from task['data']""" if data is None: raise ValidationError('Task is empty (None)') replace_task_data_undefined_with_config_field(data, project) # iterate over data types from project for data_key, data_type in project.data_types.items(): # get array name in case of Repeater tag is_array = '[' in data_key data_key = data_key.split('[')[0] if '.' in data_key: keys = data_key.split('.') try: data_item = reduce(getitem, keys, data) except KeyError: raise ValidationError('"{data_key}" key is expected in task data'.format(data_key=data_key)) else: if data_key not in data: raise ValidationError('"{data_key}" key is expected in task data'.format(data_key=data_key)) data_item = data[data_key] if is_array: expected_types = (list,) else: expected_types = _DATA_TYPES.get(data_type, (str,)) if not isinstance(data_item, tuple(expected_types)): raise ValidationError( "data['{data_key}']={data_value} is of type '{type}', " 'but the object tag {data_type} expects the following types: {expected_types}'.format( data_key=data_key, data_value=data_item, type=type(data_item).__name__, data_type=data_type, expected_types=[e.__name__ for e in expected_types], ) ) return data @staticmethod def check_data_and_root(project, data, dict_is_root=False): """Check data consistent and data is dict with task or dict['task'] is task :param project: :param data: :param dict_is_root: :return: """ try: TaskValidator.check_data(project, data) except ValidationError as e: if dict_is_root: raise ValidationError(e.detail[0] + ' [assume: item as is = task root with values] ') else: raise ValidationError(e.detail[0] + ' [assume: item["data"] = task root with values]') @staticmethod def check_allowed(task): # task is required if 'data' not in task: return False # everything is ok return True @staticmethod def raise_if_wrong_class(task, key, class_def): if key in task and not isinstance(task[key], class_def): if isinstance(class_def, tuple): class_def = ' or '.join([c.__name__ for c in class_def]) else: class_def = class_def.__name__ raise ValidationError('Task[{key}] must be {class_def}'.format(key=key, class_def=class_def)) def validate(self, task): """Validate whole task with task['data'] and task['annotations']. task['predictions']""" # task is class if hasattr(task, 'data'): self.check_data_and_root(self.project, task.data) return task # self.instance is loaded by get_object of view if self.instance and hasattr(self.instance, 'data'): if isinstance(self.instance.data, dict): data = self.instance.data elif isinstance(self.instance.data, str): try: data = json.loads(self.instance.data) except ValueError as e: raise ValidationError("Can't parse task data: " + str(e)) else: raise ValidationError( 'Field "data" must be string or dict, but not "' + type(self.instance.data) + '"' ) self.check_data_and_root(self.instance.project, data) return task # check task is dict if not isinstance(task, dict): raise ValidationError('Task root must be dict with "data", "meta", "annotations", "predictions" fields') # task[data] | task[annotations] | task[predictions] | task[meta] if self.check_allowed(task): # task[data] self.raise_if_wrong_class(task, 'data', (dict, list)) self.check_data_and_root(self.project, task['data']) # task[annotations]: we can't use AnnotationSerializer for validation # because it's much different with validation we need here self.raise_if_wrong_class(task, 'annotations', list) for annotation in task.get('annotations', []): if not isinstance(annotation, dict): logger.warning('Annotation must be dict, but "%s" found', str(type(annotation))) continue ok = 'result' in annotation if not ok: raise ValidationError('Annotation must have "result" fields') # check result is list if not isinstance(annotation.get('result', []), list): raise ValidationError('"result" field in annotation must be list') # task[predictions] self.raise_if_wrong_class(task, 'predictions', list) for prediction in task.get('predictions', []): if not isinstance(prediction, dict): logger.warning('Prediction must be dict, but "%s" found', str(type(prediction))) continue ok = 'result' in prediction if not ok: raise ValidationError('Prediction must have "result" fields') # task[meta] self.raise_if_wrong_class(task, 'meta', (dict, list)) # task is data as is, validate task as data and move it to task['data'] else: self.check_data_and_root(self.project, task, dict_is_root=True) task = {'data': task} return task @staticmethod def format_error(i, detail, item): if len(detail) == 1: code = (str(detail[0].code + ' ')) if detail[0].code != 'invalid' else '' return 'Error {code} at item {i}: {detail} :: {item}'.format(code=code, i=i, detail=detail[0], item=item) else: errors = ', '.join(detail) codes = str([d.code for d in detail]) return 'Errors {codes} at item {i}: {errors} :: {item}'.format(codes=codes, i=i, errors=errors, item=item) def to_internal_value(self, data): """Body of run_validation for all data items""" if data is None: raise ValidationError('All tasks are empty (None)') if not isinstance(data, list): raise ValidationError('data is not a list') if len(data) == 0: raise ValidationError('data is empty') ret, errors = [], [] self.annotation_count, self.prediction_count = 0, 0 for i, item in enumerate(data): try: validated = self.validate(item) except ValidationError as exc: error = self.format_error(i, exc.detail, item) errors.append(error) # do not print to user too many errors if len(errors) >= 100: errors[99] = '...' break else: ret.append(validated) errors.append({}) if 'annotations' in item: self.annotation_count += len(item['annotations']) if 'predictions' in item: self.prediction_count += len(item['predictions']) if any(errors): logger.warning("Can't deserialize tasks due to " + str(errors)) raise ValidationError(errors) return ret def is_url(string): try: result = urlparse(string.strip()) return all([result.scheme, result.netloc]) except ValueError: return False