Bin
2025-12-17 21f0498f62ada55651f4d232327e15fc47f498b1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license.
"""
import csv
import io
import logging
import mimetypes
import os
 
try:
    import ujson as json
except:  # noqa: E722
    import json
 
from core.utils.common import timeit
from core.utils.io import ssrf_safe_get
from django.conf import settings
from django.core.files.uploadedfile import SimpleUploadedFile
from rest_framework.exceptions import ValidationError
 
from .models import FileUpload
 
logger = logging.getLogger(__name__)
# Raise the csv module's default field size limit (128 KiB) tenfold so that
# CSV uploads with very large cell values don't fail with a csv.Error.
csv.field_size_limit(131072 * 10)
 
 
def is_binary(f):
    """Return True when *f* is a binary (raw or buffered) byte stream."""
    binary_stream_bases = (io.RawIOBase, io.BufferedIOBase)
    return isinstance(f, binary_stream_bases)
 
 
def csv_generate_header(file):
    """Generate placeholder column names ('column1', 'column2', ...) for a headless CSV file.

    Reads the first line to count columns, then rewinds the file to the start.
    Works with both text and binary file objects.
    """
    file.seek(0)
    first_line = file.readline()

    # Pick the separator type matching the stream mode (bytes vs str)
    separator = b',' if isinstance(first_line, bytes) else ','
    column_count = len(first_line.split(separator))

    file.seek(0)
    return ['column' + str(index) for index in range(1, column_count + 1)]
 
 
def check_max_task_number(tasks):
    """Validate the task count against the settings.TASKS_MAX_NUMBER limit.

    :param tasks: list of task dicts about to be imported
    :raises ValidationError: when the number of tasks exceeds the limit
    """
    task_count = len(tasks)
    limit = settings.TASKS_MAX_NUMBER
    if task_count > limit:
        raise ValidationError(f'Maximum task number is {limit}, current task number is {task_count}')
 
 
def check_tasks_max_file_size(value):
    """Validate a total upload size against settings.TASKS_MAX_FILE_SIZE.

    :param value: total size in bytes of the files being imported
    :raises ValidationError: when *value* reaches or exceeds the limit
    """
    limit = settings.TASKS_MAX_FILE_SIZE
    if value >= limit:
        raise ValidationError(
            f'Maximum total size of all files is {limit} bytes, current size is {value} bytes'
        )
 
 
def check_extensions(files):
    """Validate that every uploaded file has a supported extension.

    :param files: mapping of form field name -> uploaded file object
    :raises ValidationError: when a file's extension (lowercased) is not in
        settings.SUPPORTED_EXTENSIONS
    """
    # Only the file objects are needed; the mapping keys were previously
    # bound to an unused local (`filename`).
    for file_obj in files.values():
        _, ext = os.path.splitext(file_obj.name)
        if ext.lower() not in settings.SUPPORTED_EXTENSIONS:
            raise ValidationError(f'{ext} extension is not supported')
 
 
def check_request_files_size(files):
    """Validate that the combined size of all uploaded files is within the limit.

    :param files: mapping of form field name -> uploaded file object
    :raises ValidationError: via check_tasks_max_file_size when the total is too large
    """
    total_size = sum(file.size for file in files.values())
    check_tasks_max_file_size(total_size)
 
 
def create_file_upload(user, project, file):
    """Create and save a FileUpload for *file*, sanitizing SVG content when enabled.

    :param user: owner of the upload
    :param project: project the upload belongs to
    :param file: uploaded file object (Django UploadedFile or similar)
    :return: the saved FileUpload instance
    """
    instance = FileUpload(user=user, project=project, file=file)
    if settings.SVG_SECURITY_CLEANUP:
        # Detect SVG by guessing the mimetype from the stored filename
        content_type, encoding = mimetypes.guess_type(str(instance.file.name))
        if content_type in ['image/svg+xml']:
            clean_xml = allowlist_svg(instance.file.read().decode())
            # Rewrite the file in place: rewind, overwrite with the cleaned
            # markup, then truncate to drop leftover bytes from the original
            instance.file.seek(0)
            instance.file.write(clean_xml.encode())
            instance.file.truncate()
    instance.save()
    return instance
 
 
def allowlist_svg(dirty_xml):
    """Filter out malicious/harmful content from SVG markup.

    Keeps only an explicit allowlist of drawing tags; everything else
    (scripts, event handlers, unknown elements) is removed by lxml's Cleaner.

    :param dirty_xml: raw SVG markup as a string
    :return: sanitized markup string
    """
    from lxml.html import clean

    allowed = [
        'xml',
        'svg',
        'circle',
        'ellipse',
        'line',
        'path',
        'polygon',
        'vector',
        'rect',
    ]

    cleaner_options = dict(
        allow_tags=allowed,
        style=True,
        links=True,
        add_nofollow=False,
        page_structure=True,
        safe_attrs_only=False,
        remove_unknown_tags=False,
    )
    return clean.Cleaner(**cleaner_options).clean_html(dirty_xml)
 
 
def str_to_json(data):
    """Best-effort JSON parse of *data*.

    Single quotes are rewritten to double quotes first so that Python-dict-style
    strings can be parsed too. Returns the parsed object, or None on failure.
    """
    normalized = data.replace("'", '"')
    try:
        return json.loads(normalized)
    except ValueError:
        return None
 
 
def tasks_from_url(file_upload_ids, project, user, url, could_be_tasks_list):
    """Download a file using *url* and read tasks from it.

    :param file_upload_ids: list of FileUpload ids; the new upload's id is appended
    :param project: project the tasks are imported into
    :param user: user performing the import
    :param url: URL to download the tasks file from
    :param could_be_tasks_list: running flag; set True when the uploaded file's
        format could contain a list of tasks
    :return: tuple (data_keys, found_formats, tasks, file_upload_ids, could_be_tasks_list)
    :raises ValidationError: on unsupported extension, oversized file, or any
        download/parse error (wrapped)
    """
    try:
        filename = url.rsplit('/', 1)[-1]

        response = ssrf_safe_get(
            url, verify=project.organization.should_verify_ssl_certs(), stream=True, headers={'Accept-Encoding': None}
        )

        # Prefer the filename from the resolved URL after redirects
        resolved_url = response.url if hasattr(response, 'url') else url
        if resolved_url != url:
            # Parse filename from the resolved URL after redirect
            from urllib.parse import unquote, urlparse

            parsed_url = urlparse(resolved_url)
            path = unquote(parsed_url.path)
            resolved_filename = path.rsplit('/', 1)[-1]
            # Remove query parameters that survived URL parsing
            if '?' in resolved_filename:
                resolved_filename = resolved_filename.split('?')[0]
            filename = resolved_filename

        # Check file extension (if any) against the supported list
        _, ext = os.path.splitext(filename)
        if ext and ext.lower() not in settings.SUPPORTED_EXTENSIONS:
            raise ValidationError(f'{ext} extension is not supported')

        # Check declared file size before downloading the body
        content_length = response.headers.get('content-length')
        if content_length:
            check_tasks_max_file_size(int(content_length))

        file_content = response.content
        file_upload = create_file_upload(user, project, SimpleUploadedFile(filename, file_content))
        if file_upload.format_could_be_tasks_list:
            could_be_tasks_list = True
        file_upload_ids.append(file_upload.id)
        tasks, found_formats, data_keys = FileUpload.load_tasks_from_uploaded_files(project, file_upload_ids)

    except ValidationError:
        # Re-raise as-is with the original traceback (was `raise e`)
        raise
    except Exception as e:
        raise ValidationError(str(e))
    return data_keys, found_formats, tasks, file_upload_ids, could_be_tasks_list
 
 
@timeit
def create_file_uploads(user, project, FILES):
    """Validate and persist FileUpload records for every file in FILES.

    :param user: owner of the uploads
    :param project: project the uploads belong to
    :param FILES: mapping of form field name -> uploaded file object
    :return: tuple (file_upload_ids, could_be_tasks_list)
    :raises ValidationError: when total size or an extension is not allowed
    """
    check_request_files_size(FILES)
    check_extensions(FILES)

    file_upload_ids = []
    could_be_tasks_list = False
    for uploaded in FILES.values():
        upload = create_file_upload(user, project, uploaded)
        if upload.format_could_be_tasks_list:
            could_be_tasks_list = True
        file_upload_ids.append(upload.id)

    logger.debug(f'created file uploads: {file_upload_ids} could_be_tasks_list: {could_be_tasks_list}')
    return file_upload_ids, could_be_tasks_list
 
 
def load_tasks_for_async_import(project_import, user):
    """Load tasks from different types of request.data / request.files saved in project_import model.

    Sources, in priority order: already-created file uploads, a URL (either an
    inline JSON string or a downloadable file), or inline task data.

    :param project_import: model instance holding file_upload_ids / url / tasks
    :param user: user performing the import
    :return: tuple (tasks, file_upload_ids, found_formats, data_keys)
    :raises ValidationError: when no tasks are found or the data root is not a list
    """
    file_upload_ids, found_formats, data_keys = [], [], set()
    # Initialize so a project_import with no recognized source fails with the
    # intended ValidationError below instead of a NameError on `tasks`.
    tasks = []

    if project_import.file_upload_ids:
        file_upload_ids = project_import.file_upload_ids
        tasks, found_formats, data_keys = FileUpload.load_tasks_from_uploaded_files(
            project_import.project, file_upload_ids
        )

    # take tasks from url address
    elif project_import.url:
        url = project_import.url
        # try to load json with task or tasks from url as string
        json_data = str_to_json(url)
        if json_data:
            file_upload = create_file_upload(
                user,
                project_import.project,
                SimpleUploadedFile('inplace.json', url.encode()),
            )
            file_upload_ids.append(file_upload.id)
            tasks, found_formats, data_keys = FileUpload.load_tasks_from_uploaded_files(
                project_import.project, file_upload_ids
            )

        # download file using url and read tasks from it
        else:
            could_be_tasks_list = False
            (
                data_keys,
                found_formats,
                tasks,
                file_upload_ids,
                could_be_tasks_list,
            ) = tasks_from_url(file_upload_ids, project_import.project, user, url, could_be_tasks_list)
            if could_be_tasks_list:
                project_import.could_be_tasks_list = True
                project_import.save(update_fields=['could_be_tasks_list'])

    elif project_import.tasks:
        tasks = project_import.tasks

    # check is data root is list
    if not isinstance(tasks, list):
        raise ValidationError('load_tasks: Data root must be list')

    # empty tasks error
    if not tasks:
        raise ValidationError('load_tasks: No tasks added')

    check_max_task_number(tasks)
    return tasks, file_upload_ids, found_formats, list(data_keys)
 
 
def load_tasks_for_async_import_streaming(project_import, user, batch_size=1000):
    """Load tasks from different types of request.data / request.files saved in project_import model,
    yielding tasks in batches to reduce memory usage.

    Generator: yields (batch_tasks, file_upload_ids, found_formats, data_keys)
    tuples per batch, and returns (all_file_upload_ids, all_found_formats,
    all_data_keys) as its StopIteration value.

    :param project_import: model instance holding file_upload_ids / url / tasks
    :param user: user performing the import
    :param batch_size: max tasks per yielded batch; falls back to
        settings.IMPORT_BATCH_SIZE when falsy
    :raises ValidationError: when no tasks are found or a data root is not a list
    """
    from django.conf import settings

    if not batch_size:
        batch_size = settings.IMPORT_BATCH_SIZE

    # Aggregates returned at generator exhaustion
    all_file_upload_ids = []
    all_found_formats = {}
    all_data_keys = set()

    if project_import.file_upload_ids:
        file_upload_ids = project_import.file_upload_ids
        all_file_upload_ids = file_upload_ids.copy()

        # True streaming path: batches come directly from the uploaded files
        for batch_tasks, batch_formats, batch_data_keys in FileUpload.load_tasks_from_uploaded_files_streaming(
            project_import.project, file_upload_ids, batch_size=batch_size
        ):
            all_found_formats.update(batch_formats)
            all_data_keys.update(batch_data_keys)

            # Validate each batch
            if not isinstance(batch_tasks, list):
                raise ValidationError('load_tasks: Data root must be list')
            if not batch_tasks:
                continue  # Skip empty batches

            check_max_task_number(batch_tasks)
            yield batch_tasks, file_upload_ids, batch_formats, list(batch_data_keys)

    elif project_import.url:
        # For URL imports, we still need to load everything at once
        # since we don't have streaming support for URL-based imports yet
        url = project_import.url
        file_upload_ids, found_formats, data_keys = [], [], set()

        # try to load json with task or tasks from url as string
        json_data = str_to_json(url)
        if json_data:
            file_upload = create_file_upload(
                user,
                project_import.project,
                SimpleUploadedFile('inplace.json', url.encode()),
            )
            file_upload_ids.append(file_upload.id)
            tasks, found_formats, data_keys = FileUpload.load_tasks_from_uploaded_files(
                project_import.project, file_upload_ids
            )
        else:
            could_be_tasks_list = False
            (
                data_keys,
                found_formats,
                tasks,
                file_upload_ids,
                could_be_tasks_list,
            ) = tasks_from_url(file_upload_ids, project_import.project, user, url, could_be_tasks_list)
            if could_be_tasks_list:
                project_import.could_be_tasks_list = True
                project_import.save(update_fields=['could_be_tasks_list'])

        if not isinstance(tasks, list):
            raise ValidationError('load_tasks: Data root must be list')
        if not tasks:
            raise ValidationError('load_tasks: No tasks added')

        check_max_task_number(tasks)

        all_file_upload_ids = file_upload_ids.copy()
        all_found_formats = found_formats.copy()
        all_data_keys = data_keys.copy()

        # Slice the fully-loaded task list into fixed-size batches
        for i in range(0, len(tasks), batch_size):
            batch_tasks = tasks[i : i + batch_size]
            yield batch_tasks, file_upload_ids, found_formats, list(data_keys)

    elif project_import.tasks:
        # Inline task data stored directly on the import model
        tasks = project_import.tasks

        if not isinstance(tasks, list):
            raise ValidationError('load_tasks: Data root must be list')
        if not tasks:
            raise ValidationError('load_tasks: No tasks added')

        check_max_task_number(tasks)

        for i in range(0, len(tasks), batch_size):
            batch_tasks = tasks[i : i + batch_size]
            yield batch_tasks, [], {}, []

    else:
        raise ValidationError('load_tasks: No tasks added')

    return all_file_upload_ids, all_found_formats, list(all_data_keys)
 
 
def load_tasks(request, project):
    """Load tasks from different types of request.data / request.files.

    Sources, in priority order: uploaded files, a form-encoded ``url`` field
    (either inline JSON or a downloadable file), a single JSON dict, or a JSON
    list of tasks.

    :param request: DRF request carrying FILES / data / content_type
    :param project: project the tasks are imported into
    :return: tuple (tasks, file_upload_ids, could_be_tasks_list, found_formats, data_keys)
    :raises ValidationError: on missing/empty/invalid task data
    """
    file_upload_ids, found_formats, data_keys = [], [], set()
    could_be_tasks_list = False

    # take tasks from request FILES
    if len(request.FILES):
        check_request_files_size(request.FILES)
        check_extensions(request.FILES)
        for filename, file in request.FILES.items():
            file_upload = create_file_upload(request.user, project, file)
            if file_upload.format_could_be_tasks_list:
                could_be_tasks_list = True
            file_upload_ids.append(file_upload.id)
        tasks, found_formats, data_keys = FileUpload.load_tasks_from_uploaded_files(project, file_upload_ids)

    # take tasks from url address
    elif 'application/x-www-form-urlencoded' in request.content_type:
        # empty url
        url = request.data.get('url')
        if not url:
            raise ValidationError('"url" is not found in request data')

        # try to load json with task or tasks from url as string
        json_data = str_to_json(url)
        if json_data:
            file_upload = create_file_upload(request.user, project, SimpleUploadedFile('inplace.json', url.encode()))
            file_upload_ids.append(file_upload.id)
            tasks, found_formats, data_keys = FileUpload.load_tasks_from_uploaded_files(project, file_upload_ids)

        # download file using url and read tasks from it
        else:
            (
                data_keys,
                found_formats,
                tasks,
                file_upload_ids,
                could_be_tasks_list,
            ) = tasks_from_url(file_upload_ids, project, request.user, url, could_be_tasks_list)

    # take one task from request DATA
    elif 'application/json' in request.content_type and isinstance(request.data, dict):
        tasks = [request.data]

    # take many tasks from request DATA
    elif 'application/json' in request.content_type and isinstance(request.data, list):
        tasks = request.data

    # incorrect data source
    else:
        raise ValidationError('load_tasks: No data found in DATA or in FILES')

    # check is data root is list
    if not isinstance(tasks, list):
        raise ValidationError('load_tasks: Data root must be list')

    # empty tasks error
    if not tasks:
        raise ValidationError('load_tasks: No tasks added')

    check_max_task_number(tasks)
    return tasks, file_upload_ids, could_be_tasks_list, found_formats, list(data_keys)