"""
|
The script does the following:
|
|
1. Downloads all README.md files in the label-studio-ml repository https://github.com/HumanSignal/label-studio-ml-backend by path label_studio_ml/examples/{model_name}/README.md
|
2. Parses the README.md files to extract the following information:
|
- HEADER: enclosed in `---` (e.g. `---\n Header Content \n---`)
|
- BODY: The rest of the content after header
|
3. For each `model_name` in the label-studio-ml repository, it creates a new file in the docs/source/tutorials copying the README.md content
|
4. Additionally, it changes the file in docs/source/guide/ml_tutorials.html, adding HEADER as a new item in `cards` list:
|
---
|
section: "Machine learning"
|
meta_title: Machine Learning Example Tutorials
|
meta_description: Tutorial documentation for setting up a machine learning model with predictions using PyTorch, GPT2, Sci-kit learn, and other popular frameworks.
|
layout: templates
|
cards:
|
- title: Create a simple ML backend
|
categories:
|
- image classification
|
- starter
|
image: "/tutorials/simple-image-classification.png"
|
url: "/tutorials/dummy_model.html"
|
- title: ...
|
---
|
"""
|
|
import logging
|
import os
|
import re
|
from pathlib import Path
|
from typing import List
|
|
import yaml
|
|
ML_REPO_PATH = os.getenv('ML_REPO_PATH', '/ml/')
|
|
|
def get_readme_files() -> List:
|
p = Path(ML_REPO_PATH) / 'label_studio_ml' / 'examples'
|
return sorted(list(Path(p).rglob('README.md')))
|
|
|
def parse_readme_file(file_path: str) -> dict:
|
print(file_path)
|
with open(file_path, 'r') as f:
|
content = f.read()
|
|
match = re.search(r'---(.*?)---', content, re.DOTALL)
|
header = match.group(1).strip() if match else ''
|
body = content[content.find('-->') + 3 :].strip()
|
|
return {'header': header, 'body': body}
|
|
|
def create_tutorial_files():
|
readme_files = get_readme_files()
|
|
files_and_headers = []
|
for file in readme_files:
|
model_name = file.parts[-2]
|
tutorial_path = (
|
Path(__file__).resolve().parent.parent / 'docs' / 'source' / 'guide' / 'ml_tutorials' / f'{model_name}.md'
|
)
|
tutorial_dir = os.path.dirname(tutorial_path)
|
os.makedirs(tutorial_dir, exist_ok=True)
|
|
parsed_content = parse_readme_file(file)
|
with open(tutorial_path, 'w') as f:
|
if parsed_content['header']:
|
f.write('---\n')
|
f.write(parsed_content['header'])
|
f.write('\n---\n\n')
|
f.write(parsed_content['body'])
|
files_and_headers.append(
|
{'model_name': model_name, 'header': yaml.load(parsed_content['header'], Loader=yaml.FullLoader)}
|
)
|
|
update_ml_tutorials_index(files_and_headers)
|
|
|
def update_ml_tutorials_index(files_and_headers: List):
|
# Navigate to '../docs/source/guide/ml_tutorials.html' relative to the current script
|
p = Path(__file__).resolve().parent.parent / 'docs' / 'source' / 'guide' / 'ml_tutorials.html'
|
print(f'Reading file from {str(p)}')
|
with open(str(p), 'r') as f:
|
content = f.read()
|
|
yaml_content = re.findall(r'---\n(.*?)\n---', content, re.DOTALL)
|
# read in python dict
|
data = yaml.load(yaml_content[0].strip(), Loader=yaml.FullLoader)
|
data['cards'] = []
|
print(data)
|
for f in files_and_headers:
|
h = f['header']
|
if not isinstance(h, dict):
|
logging.error(f'No dict header found in {f} file. Skipping ...')
|
continue
|
print('Processing', f['model_name'])
|
card = {'title': h.get('title') or f['model_name'], 'url': f'/guide/ml_tutorials/{f["model_name"]}.html'}
|
card.update(h)
|
data['cards'].append(card)
|
|
p = Path(__file__).resolve().parent.parent / 'docs' / 'source' / 'guide' / 'ml_tutorials.html'
|
print(f'Updating {str(p)} ... ')
|
with open(str(p), 'w') as f:
|
f.write('---\n')
|
f.write(yaml.dump(data))
|
f.write('---\n')
|
|
|
create_tutorial_files()
|