"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license.
|
"""
|
import logging
|
from collections import OrderedDict
|
from typing import Any, Iterable, Tuple
|
from urllib.parse import unquote
|
|
import ujson as json
|
from core.feature_flags import flag_set
|
from core.utils.common import int_from_request
|
from data_manager.models import View
|
from data_manager.prepare_params import PrepareParams
|
from django.conf import settings
|
from rest_framework.generics import get_object_or_404
|
from tasks.models import Task
|
|
TASKS = 'tasks:'
|
logger = logging.getLogger(__name__)
|
|
|
class DataManagerException(Exception):
    """Raised when a data manager request cannot be processed (e.g. view/project mismatch or malformed payload)."""
|
|
|
def get_all_columns(project, *_):
    """Make columns info for the frontend data manager.

    Combines columns derived from the project's labeling config and from
    imported task data with the fixed set of task-level columns (ID,
    annotation counters, prediction stats, timestamps, etc.).

    :param project: project instance providing ``data_types``, ``summary``,
        ``all_members`` and ``get_model_versions()``
    :return: dict ``{'columns': [...]}`` in the format the data manager frontend expects
    """
    result = {'columns': []}

    # frontend uses MST data model, so we need two directional referencing parent <-> child
    task_data_children = []

    data_types = OrderedDict()

    # add data types from config; skip keys from the Repeater tag, because we
    # already have their base data, e.g.: skip 'image[{{idx}}]' because we have
    # 'image' list already
    project_data_types = {}
    for key, value in project.data_types.items():
        if '[' not in key:
            project_data_types[key] = value
    data_types.update(project_data_types)

    # all data types from import data
    all_data_columns = project.summary.all_data_columns
    logger.info(f'get_all_columns: project_id={project.id} {all_data_columns=} {data_types=}')
    if all_data_columns:
        data_types.update({key: 'Unknown' for key in all_data_columns if key not in data_types})
    logger.info(f'get_all_columns: project_id={project.id} {data_types=}')

    # remove $undefined$ if there is one type at least in labeling config, because it will be resolved automatically
    if len(project_data_types) > 0:
        data_types.pop(settings.DATA_UNDEFINED_NAME, None)
    logger.info(f'get_all_columns: project_id={project.id} {data_types=} {project_data_types=}')

    for key, data_type in data_types.items():  # make data types from labeling config first
        column = {
            'id': key,
            'title': key if key != settings.DATA_UNDEFINED_NAME else 'data',
            'type': data_type if data_type in ['Image', 'Audio', 'AudioPlus', 'Video', 'Unknown'] else 'String',
            'target': 'tasks',
            'parent': 'data',
            'visibility_defaults': {
                'explore': True,
                'labeling': key in project_data_types or key == settings.DATA_UNDEFINED_NAME,
            },
            'project_defined': True,
        }
        result['columns'].append(column)
        task_data_children.append(column['id'])

    # when the flag is set, member lists are omitted from column schemas to keep them short
    remove_members_schema = flag_set('fflag_feat_fit_449_datamanager_filter_members_short', user='auto')

    # --- Data root ---
    data_root = {
        'id': 'data',
        'title': 'data',
        'type': 'List',
        'target': 'tasks',
        'children': task_data_children,
        'project_defined': False,
    }

    result['columns'] += [
        # --- Tasks ---
        {
            'id': 'id',
            'title': 'ID',
            'type': 'Number',
            'help': 'Task ID',
            'target': 'tasks',
            'visibility_defaults': {'explore': True, 'labeling': False},
            'project_defined': False,
        }
    ]

    result['columns'] += [
        {
            'id': 'inner_id',
            'title': 'Inner ID',
            'type': 'Number',
            'help': 'Internal task ID starting from 1 for the current project',
            'target': 'tasks',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        }
    ]

    if remove_members_schema:
        project_members = []
    else:
        project_members = project.all_members.values_list('id', flat=True)

    result['columns'] += [
        {
            'id': 'completed_at',
            'title': 'Completed',
            'type': 'Datetime',
            'target': 'tasks',
            'help': 'Last annotation date',
            'visibility_defaults': {'explore': True, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'total_annotations',
            'title': 'Annotations',
            'type': 'Number',
            'target': 'tasks',
            'help': 'Total annotations per task',
            'visibility_defaults': {'explore': True, 'labeling': True},
            'project_defined': False,
        },
        {
            'id': 'cancelled_annotations',
            'title': 'Cancelled',
            'type': 'Number',
            'target': 'tasks',
            'help': 'Total cancelled (skipped) annotations',
            'visibility_defaults': {'explore': True, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'total_predictions',
            'title': 'Predictions',
            'type': 'Number',
            'target': 'tasks',
            'help': 'Total predictions per task',
            'visibility_defaults': {'explore': True, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'annotators',
            'title': 'Annotated by',
            'type': 'List',
            'target': 'tasks',
            'help': 'All users who completed the task',
            **({'schema': {'items': project_members}} if not remove_members_schema else {}),
            'visibility_defaults': {'explore': True, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'annotations_results',
            'title': 'Annotation results',
            'type': 'String',
            'target': 'tasks',
            'help': 'Annotation results stacked over all annotations',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'annotations_ids',
            'title': 'Annotation IDs',
            'type': 'String',
            'target': 'tasks',
            'help': 'Annotation IDs stacked over all annotations',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'predictions_score',
            'title': 'Prediction score',
            'type': 'Number',
            'target': 'tasks',
            'help': 'Average prediction score over all task predictions',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'predictions_model_versions',
            'title': 'Prediction model versions',
            'type': 'List',
            'target': 'tasks',
            'help': 'Model versions aggregated over all predictions',
            'schema': {'items': project.get_model_versions(), 'multiple': True},
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'predictions_results',
            'title': 'Prediction results',
            'type': 'String',
            'target': 'tasks',
            'help': 'Prediction results stacked over all predictions',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'file_upload',
            'title': 'Upload filename',
            'type': 'String',
            'target': 'tasks',
            'help': 'Filename of uploaded file',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'storage_filename',
            'title': 'Storage filename',
            'type': 'String',
            'target': 'tasks',
            'help': 'Filename from import storage',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'created_at',
            'title': 'Created at',
            'type': 'Datetime',
            'target': 'tasks',
            'help': 'Task creation time',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'updated_at',
            'title': 'Updated at',
            'type': 'Datetime',
            'target': 'tasks',
            'help': 'Task update time',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'updated_by',
            'title': 'Updated by',
            'type': 'List',
            'target': 'tasks',
            'help': 'User who did the last task update',
            **({'schema': {'items': project_members}} if not remove_members_schema else {}),
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'avg_lead_time',
            'title': 'Lead Time',
            'type': 'Number',
            'help': 'Average lead time over all annotations (seconds)',
            'target': 'tasks',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
        {
            'id': 'draft_exists',
            'title': 'Drafts',
            'type': 'Boolean',
            'help': 'True if at least one draft exists for the task',
            'target': 'tasks',
            'visibility_defaults': {'explore': False, 'labeling': False},
            'project_defined': False,
        },
    ]

    result['columns'].append(data_root)

    return result
|
|
|
def get_prepare_params(request, project):
    """Build PrepareParams for task preparation from the incoming request.

    Sources, in priority order:
    * a stored View, when a ``view`` id is present in the request
    * ``selectedItems``/``filters``/``ordering`` taken from the request itself
      (either the ``query`` GET argument or the request body) otherwise
    """
    # a stored view takes precedence: use its filters and selected items
    view_pk = int_from_request(request.GET, 'view', 0) or int_from_request(request.data, 'view', 0)
    if view_pk > 0:
        view = get_object_or_404(View, pk=view_pk)
        if view.project.pk != project.pk:
            raise DataManagerException('Project and View mismatch')
        prepare_params = view.get_prepare_tasks_params(add_selected_items=True)
        prepare_params.request = request
        return prepare_params

    # no view specified: read the payload either from the url query argument or from the body
    if 'query' in request.GET:
        data = json.loads(unquote(request.GET['query']))
    else:
        data = request.data

    selected = data.get('selectedItems', {'all': True, 'excluded': []})
    if not isinstance(selected, dict):
        if not isinstance(selected, str):
            raise DataManagerException(
                'selectedItems must be dict: {"all": [true|false], '
                '"excluded | included": [...task_ids...]}. '
                f'Found type: {type(selected)} with value: {selected}'
            )
        # try to parse JSON string
        try:
            selected = json.loads(selected)
        except Exception as e:
            logger.error(f'Error parsing selectedItems: {e}')
            raise DataManagerException(
                'selectedItems must be JSON encoded string for dict: {"all": [true|false], '
                '"excluded | included": [...task_ids...]}. '
                f'Found: {selected}'
            )

    return PrepareParams(
        project=project.id,
        selectedItems=selected,
        data=data,
        filters=data.get('filters', None),
        ordering=data.get('ordering', []),
        request=request,
    )
|
|
|
def get_prepared_queryset(request, project):
    """Return the Task queryset filtered according to the request's prepare params."""
    return Task.prepared.only_filtered(prepare_params=get_prepare_params(request, project))
|
|
|
def evaluate_predictions(tasks):
    """Retrieve predictions for the given tasks from their project's ML backend.

    The project is taken from the first task; the project's single associated
    ML backend (if any) is asked to predict the whole task queryset.
    Returns None when there are no tasks or no backend is configured.
    """
    if not tasks:
        return

    backend = tasks[0].project.ml_backend
    if backend:
        return backend.predict_tasks(tasks=tasks)
|
|
|
def filters_ordering_selected_items_exist(data):
    """Return the first truthy value among 'filters', 'ordering' and 'selectedItems' in `data`.

    Falls back to `data.get('selectedItems')` (possibly falsy/None) when none is truthy,
    mirroring a plain `or`-chain.
    """
    for key in ('filters', 'ordering'):
        value = data.get(key)
        if value:
            return value
    return data.get('selectedItems')
|
|
|
def custom_filter_expressions(*args, **kwargs):
    """Extension hook for custom filter expressions; a no-op in this edition."""
    return None
|
|
|
def preprocess_filter(_filter, *_):
    """Extension hook to adjust a single filter before use; returns it unchanged here."""
    return _filter
|
|
|
def preprocess_field_name(raw_field_name, project) -> Tuple[str, bool]:
    """Transform a field name (as specified in the datamanager views endpoint) to
    a django ORM field name. Also handle dotted accesses to task.data.

    Edit with care; it's critical that this function not be changed in ways that
    introduce vulnerabilities in the vein of the ORM Leak (see #5012). In particular
    it is not advisable to use `replace` or other calls that replace all instances
    of a string within this function.

    Args:
        raw_field_name: field name as sent by the frontend, optionally prefixed
            with `-` (descending) and/or `filter:` / `tasks:` markers,
            and optionally dotted for task data access (e.g. `data.image`).
        project: project whose labeling config and data summary are consulted
            to resolve `data.*` names.

    Returns: Django ORM field name: str, Sort is ascending: bool
    """

    field_name = raw_field_name
    ascending = True

    # Descending marker `-` may come at the beginning of the string
    if field_name.startswith('-'):
        ascending = False
        field_name = field_name[1:]

    # For security reasons, these must only be removed when they fall at the beginning of the string (or after `-`).
    # Order matters: 'filter:tasks:x' is stripped to 'tasks:x' and then to 'x'.
    optional_prefixes = ['filter:', 'tasks:']
    for prefix in optional_prefixes:
        if field_name.startswith(prefix):
            field_name = field_name[len(prefix) :]

    # Descending marker may also come after other prefixes. Double negative is not allowed.
    if ascending and field_name.startswith('-'):
        ascending = False
        field_name = field_name[1:]

    if field_name.startswith('data.'):
        # process as $undefined$ only if real_name is from labeling config, not from task.data
        # NOTE(review): `replace` removes every 'data.' occurrence here, not just the
        # leading one — see the warning in the docstring about `replace` usage.
        real_name = field_name.replace('data.', '')
        common_data_columns = project.summary.common_data_columns
        # `and` binds tighter than `or`: the expression is (one object tag AND name
        # matches it) OR (no object tags AND name is the predefined 'data')
        real_name_suitable = (
            # there is only one object tag in labeling config
            # and requested filter name == value from object tag
            len(project.data_types.keys()) == 1
            and real_name in project.data_types.keys()
            # file was uploaded before labeling config is set, `data.data` is system predefined name
            or len(project.data_types.keys()) == 0
            and real_name == 'data'
        )
        if (
            real_name_suitable
            # common data columns are not None
            and common_data_columns
            # $undefined$ is in common data columns, in all tasks
            and settings.DATA_UNDEFINED_NAME in common_data_columns
        ):
            field_name = f'data__{settings.DATA_UNDEFINED_NAME}'
        else:
            field_name = field_name.replace('data.', 'data__')
    return field_name, ascending
|
|
|
def intersperse(items: Iterable, separator: Any) -> list:
    """Return a list with `separator` inserted between consecutive elements of `items`.

    >>> intersperse(['one', 'two', 'three'], 0)
    ['one', 0, 'two', 0, 'three']
    """
    result = []
    for index, item in enumerate(items):
        # prepend the separator before every element except the first,
        # so an empty iterable yields an empty list
        if index:
            result.append(separator)
        result.append(item)
    return result
|