# label-studio.git

import fnmatch
import logging
import re
import types
 
from azure.storage.blob import BlobServiceClient
from core.utils.params import get_env
from django.conf import settings
from io_storages.utils import parse_range
 
logger = logging.getLogger(__name__)
 
 
class AZURE(object):
    @staticmethod
    def download_stream_response(blob_client, total_size, content_type, range_header, properties, max_range_size=None):
        """Prepare Azure blob streaming response with unified range handling.
 
        Shared Azure Blob streaming helper used by both OSS Azure Blob and Enterprise Azure SPI providers.
 
        Responsibilities:
        - Parse and normalize HTTP Range requests (including special probes)
        - Configure Azure SDK streaming parameters
        - Generate a downloader with a unified ``iter_chunks`` API
        - Build response metadata (Content-Range, Content-Length, ETag, Last-Modified)
 
        Args:
            blob_client: Azure Blob SDK client for the target blob.
            total_size (int): Size of the blob in bytes.
            content_type (str|None): Blob content type.
            range_header (str|None): Incoming HTTP Range header, e.g. 'bytes=0-'.
            properties: Blob properties (for ETag/Last-Modified extraction).
            max_range_size (int|None): Optional override for initial open-ended range size.
 
        Returns:
            tuple: (downloader, resolved_content_type, metadata)
        """
        resolved_content_type = content_type or 'application/octet-stream'
 
        streaming = True
        start, end = parse_range(range_header)
 
        if start is None and end is None:
            streaming = False
            start, end = 0, total_size
        elif start == 0 and end == 0:
            start, end = 0, 1
        elif start == 0 and (end == '' or end is None):
            mr = max_range_size if max_range_size is not None else settings.RESOLVER_PROXY_MAX_RANGE_SIZE
            end = start + mr
 
        if start is None:
            start = 0
 
        try:
            blob_client._config.max_single_get_size = 1024  # 1KB
        except Exception:
            pass
 
        if end is not None and end != '':
            length = end - start
        else:
            length = None
 
        if streaming:
            downloader = blob_client.download_blob(offset=start, length=length)
        else:
            length = total_size
            downloader = blob_client.download_blob()
 
        def _iter_chunks(self_downloader, chunk_size=1024 * 1024):
            try:
                self_downloader._config.max_chunk_get_size = chunk_size
            except Exception:
                pass
            total = 0
            for chunk in self_downloader.chunks():
                yield chunk
                total += len(chunk)
                if length is not None and total >= length:
                    return
 
        downloader.iter_chunks = types.MethodType(_iter_chunks, downloader)
        downloader.close = types.MethodType(lambda self: None, downloader)
 
        if streaming and length is not None:
            actual_length = min(length, max(0, total_size - start))
            content_length = actual_length
        else:
            content_length = length if length is not None else max(0, total_size - start)
 
        if length is not None:
            actual_end = min(start + length - 1, max(0, total_size - 1))
        else:
            actual_end = max(0, total_size - 1)
 
        status_code = 206 if streaming else 200
 
        metadata = {
            'ETag': getattr(properties, 'etag', ''),
            'ContentLength': content_length,
            'ContentRange': f'bytes {start}-{actual_end}/{total_size or 0}',
            'LastModified': getattr(properties, 'last_modified', None),
            'StatusCode': status_code,
        }
 
        return downloader, resolved_content_type, metadata
 
    @classmethod
    def get_client_and_container(cls, container, account_name=None, account_key=None):
        # get account name and key from params or from environment variables
        account_name = str(account_name) if account_name else get_env('AZURE_BLOB_ACCOUNT_NAME')
        account_key = str(account_key) if account_key else get_env('AZURE_BLOB_ACCOUNT_KEY')
        # check that both account name and key are set
        if not account_name or not account_key:
            raise ValueError(
                'Azure account name and key must be set using '
                'environment variables AZURE_BLOB_ACCOUNT_NAME and AZURE_BLOB_ACCOUNT_KEY'
            )
        connection_string = (
            'DefaultEndpointsProtocol=https;AccountName='
            + account_name
            + ';AccountKey='
            + account_key
            + ';EndpointSuffix=core.windows.net'
        )
        client = BlobServiceClient.from_connection_string(conn_str=connection_string)
        container = client.get_container_client(str(container))
        return client, container
 
    @classmethod
    def get_blob_metadata(cls, url: str, container: str, account_name: str = None, account_key: str = None) -> dict:
        """
        Get blob metadata by url
        :param url: Object key
        :param container: Azure container name
        :param account_name: Azure account name
        :param account_key: Azure account key
        :return: Object metadata dict("name": "value")
        """
        _, container = cls.get_client_and_container(container, account_name=account_name, account_key=account_key)
        blob = container.get_blob_client(url)
        return dict(blob.get_blob_properties())
 
    @classmethod
    def validate_pattern(cls, storage, pattern, glob_pattern=True):
        """
        Validate pattern against Azure Blob Storage
        :param storage: AzureBlobStorage instance
        :param pattern: Pattern to validate
        :param glob_pattern: If True, pattern is a glob pattern, otherwise it is a regex pattern
        :return: Message if pattern is not valid, empty string otherwise
        """
        logger.debug('Validating Azure Blob Storage pattern.')
        client, container = storage.get_client_and_container()
        if storage.prefix:
            generator = container.list_blob_names(
                name_starts_with=storage.prefix,
                results_per_page=settings.CLOUD_STORAGE_CHECK_FOR_RECORDS_PAGE_SIZE,
                timeout=settings.CLOUD_STORAGE_CHECK_FOR_RECORDS_TIMEOUT,
            )
        else:
            generator = container.list_blob_names(
                results_per_page=settings.CLOUD_STORAGE_CHECK_FOR_RECORDS_PAGE_SIZE,
                timeout=settings.CLOUD_STORAGE_CHECK_FOR_RECORDS_TIMEOUT,
            )
        # compile pattern to regex
        if glob_pattern:
            pattern = fnmatch.translate(pattern)
        regex = re.compile(str(pattern))
        # match pattern against all keys in the container
        for index, key in enumerate(generator):
            # skip directories
            if key.endswith('/'):
                logger.debug(key + ' is skipped because it is a folder')
                continue
            if regex and regex.match(key):
                logger.debug(key + ' matches file pattern')
                return ''
        return 'No objects found matching the provided glob pattern'