# label-studio.git

"""
The script does the following:
 
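1. Reads all README.md files from a local checkout of the label-studio-ml repository https://github.com/HumanSignal/label-studio-ml-backend (root taken from the `ML_REPO_PATH` environment variable, default `/ml/`), searching recursively under label_studio_ml/examples/, e.g. label_studio_ml/examples/{model_name}/README.md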
2. Parses the README.md files to extract the following information:
- HEADER: enclosed in `---` (e.g. `---\n Header Content \n---`)
- BODY: The rest of the content after the first `-->` marker
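3. For each `model_name` (the name of the directory containing the README.md) in the label-studio-ml repository, it creates the file docs/source/guide/ml_tutorials/{model_name}.md containing the HEADER (re-wrapped in `---`) followed by the BODY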
4. Additionally, it rewrites the file docs/source/guide/ml_tutorials.html, rebuilding the `cards` list with one item per HEADER:
    ---
    section: "Machine learning"
    meta_title: Machine Learning Example Tutorials
    meta_description: Tutorial documentation for setting up a machine learning model with predictions using PyTorch, GPT2, Sci-kit learn, and other popular frameworks.
    layout: templates
    cards:
    - title: Create a simple ML backend
      categories:
      - image classification
      - starter
      image: "/tutorials/simple-image-classification.png"
      url: "/tutorials/dummy_model.html"
    - title: ...
    ---
"""
 
import logging
import os
import re
from pathlib import Path
from typing import List
 
import yaml
 
ML_REPO_PATH = os.getenv('ML_REPO_PATH', '/ml/')
 
 
def get_readme_files() -> List:
    """Collect the example READMEs from the local ML backend checkout.

    Recursively searches ``<ML_REPO_PATH>/label_studio_ml/examples`` for files
    named ``README.md``.

    Returns:
        The matching paths as a list of ``Path`` objects in ascending sort order.
    """
    examples_root = Path(ML_REPO_PATH).joinpath('label_studio_ml', 'examples')
    found = list(examples_root.rglob('README.md'))
    found.sort()
    return found
 
 
def parse_readme_file(file_path: str) -> dict:
    """Split a README into its ``---`` delimited header and its body.

    The path is printed to stdout before the file is read.

    Args:
        file_path: Location of the README to read (anything ``open`` accepts).

    Returns:
        A dict with two string entries:

        * ``'header'`` - the stripped text between the first pair of ``---``
          markers, or ``''`` when the file has no such pair.
        * ``'body'`` - the stripped text after the first ``-->`` marker. When
          the file contains no ``-->``, the body is the text following a header
          that opens the file, or the entire file otherwise.
    """
    print(file_path)
    # Decode as UTF-8 explicitly so the result does not depend on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    match = re.search(r'---(.*?)---', content, re.DOTALL)
    header = match.group(1).strip() if match else ''

    marker = content.find('-->')
    if marker != -1:
        body = content[marker + len('-->'):]
    elif match and not content[: match.start()].strip():
        # No `-->` terminator, but the header opens the file: the body is whatever follows it.
        body = content[match.end():]
    else:
        # `find` returned -1; slicing from `-1 + 3` would silently drop the first two characters.
        body = content

    return {'header': header, 'body': body.strip()}
 
 
def create_tutorial_files():
    """Copy every example README into the docs tree and refresh the tutorials index.

    For each path returned by ``get_readme_files`` the name of the README's
    parent directory is used as ``model_name``. The README is parsed with
    ``parse_readme_file`` and written to
    ``../docs/source/guide/ml_tutorials/<model_name>.md`` (relative to this
    script's directory): the header, when non-empty, is re-wrapped in ``---``
    lines and followed by the body. The header of every README is also parsed
    as YAML, and the collected ``{'model_name', 'header'}`` entries are passed
    to ``update_ml_tutorials_index``.
    """
    readme_files = get_readme_files()

    # The destination directory is the same for every README, so resolve it once.
    tutorials_dir = Path(__file__).resolve().parent.parent / 'docs' / 'source' / 'guide' / 'ml_tutorials'
    if readme_files:
        tutorials_dir.mkdir(parents=True, exist_ok=True)

    files_and_headers = []
    for file in readme_files:
        model_name = file.parts[-2]
        parsed_content = parse_readme_file(file)
        header_text = parsed_content['header']

        pieces = []
        if header_text:
            pieces.extend(['---\n', header_text, '\n---\n\n'])
        pieces.append(parsed_content['body'])

        # Write UTF-8 explicitly so the output does not depend on the platform's locale encoding.
        with open(tutorials_dir / f'{model_name}.md', 'w', encoding='utf-8') as f:
            f.write(''.join(pieces))

        # NOTE(review): yaml.load with FullLoader is applied to text taken from README files of
        # another repository; if that content is not fully trusted, yaml.safe_load is the safer choice.
        files_and_headers.append(
            {'model_name': model_name, 'header': yaml.load(header_text, Loader=yaml.FullLoader)}
        )

    update_ml_tutorials_index(files_and_headers)
 
 
def update_ml_tutorials_index(files_and_headers: List):
    """Rebuild the ``cards`` list in the front matter of the tutorials index page.

    Reads ``../docs/source/guide/ml_tutorials.html`` (relative to this script's
    directory), parses the first ``---`` delimited block as YAML, replaces its
    ``cards`` entry with one card per usable header, and rewrites the file so
    that it contains only the regenerated front matter block.

    Each card starts from ``title`` (the model name) and ``url``
    (``/guide/ml_tutorials/<model_name>.html``) and is then updated with every
    key of the header, so header values take precedence. If the resulting title
    is missing or empty, the model name is used instead.

    Args:
        files_and_headers: Dicts with a ``'model_name'`` string and a
            ``'header'`` value. Entries whose header is not a dict are logged
            as errors and skipped.

    Raises:
        IndexError: If the index file contains no ``---`` delimited block.
    """
    # Navigate to '../docs/source/guide/ml_tutorials.html' relative to the current script
    p = Path(__file__).resolve().parent.parent / 'docs' / 'source' / 'guide' / 'ml_tutorials.html'
    print(f'Reading file from {str(p)}')
    with open(str(p), 'r', encoding='utf-8') as f:
        content = f.read()

    yaml_content = re.findall(r'---\n(.*?)\n---', content, re.DOTALL)
    # read in python dict
    data = yaml.load(yaml_content[0].strip(), Loader=yaml.FullLoader)
    # Cards are regenerated from scratch on every run, not merged with existing ones.
    data['cards'] = []
    print(data)
    for entry in files_and_headers:
        h = entry['header']
        if not isinstance(h, dict):
            logging.error(f'No dict header found in {entry} file. Skipping ...')
            continue
        model_name = entry['model_name']
        print('Processing', model_name)
        card = {'title': model_name, 'url': f'/guide/ml_tutorials/{model_name}.html'}
        card.update(h)
        # A header with an empty or null `title` must not wipe out the fallback title.
        if not card.get('title'):
            card['title'] = model_name
        data['cards'].append(card)

    print(f'Updating {str(p)} ... ')
    with open(str(p), 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write(yaml.dump(data))
        f.write('---\n')
 
 
# Entry point: the sync runs whenever this module is executed or imported,
# because the call is not wrapped in an `if __name__ == '__main__':` guard.
create_tutorial_files()