feat(ml): composable ml (#9973)

* modularize model classes

* various fixes

* expose port

* change response

* round coordinates

* simplify preload

* update server

* simplify interface

simplify

* update tests

* composable endpoint

* cleanup

fixes

remove unnecessary interface

support text input, cleanup

* ew camelcase

* update server

server fixes

fix typing

* ml fixes

update locustfile

fixes

* cleaner response

* better repo response

* update tests

formatting and typing

rename

* undo compose change

* linting

fix type

actually fix typing

* stricter typing

fix detection-only response

no need for defaultdict

* update spec file

update api

linting

* update e2e

* unnecessary dimension

* remove commented code

* remove duplicate code

* remove unused imports

* add batch dim
Author: Mert
Date: 2024-06-06 23:09:47 -04:00
Committed by: GitHub
Parent: 7a46f80ddc
Commit: 2b1b43a7e4
39 changed files with 982 additions and 999 deletions

View File

@@ -1,24 +1,40 @@
from typing import Any
from app.schemas import ModelType
from app.models.base import InferenceModel
from app.models.clip.textual import MClipTextualEncoder, OpenClipTextualEncoder
from app.models.clip.visual import OpenClipVisualEncoder
from app.schemas import ModelSource, ModelTask, ModelType
from .base import InferenceModel
from .clip import MCLIPEncoder, OpenCLIPEncoder
from .constants import is_insightface, is_mclip, is_openclip
from .facial_recognition import FaceRecognizer
from .constants import get_model_source
from .facial_recognition.detection import FaceDetector
from .facial_recognition.recognition import FaceRecognizer
def from_model_type(model_type: ModelType, model_name: str, **model_kwargs: Any) -> InferenceModel:
match model_type:
case ModelType.CLIP:
if is_openclip(model_name):
return OpenCLIPEncoder(model_name, **model_kwargs)
elif is_mclip(model_name):
return MCLIPEncoder(model_name, **model_kwargs)
case ModelType.FACIAL_RECOGNITION:
if is_insightface(model_name):
return FaceRecognizer(model_name, **model_kwargs)
def get_model_class(model_name: str, model_type: ModelType, model_task: ModelTask) -> type[InferenceModel]:
source = get_model_source(model_name)
match source, model_type, model_task:
case ModelSource.OPENCLIP | ModelSource.MCLIP, ModelType.VISUAL, ModelTask.SEARCH:
return OpenClipVisualEncoder
case ModelSource.OPENCLIP, ModelType.TEXTUAL, ModelTask.SEARCH:
return OpenClipTextualEncoder
case ModelSource.MCLIP, ModelType.TEXTUAL, ModelTask.SEARCH:
return MClipTextualEncoder
case ModelSource.INSIGHTFACE, ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION:
return FaceDetector
case ModelSource.INSIGHTFACE, ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION:
return FaceRecognizer
case _:
raise ValueError(f"Unknown model type {model_type}")
raise ValueError(f"Unknown model combination: {source}, {model_type}, {model_task}")
raise ValueError(f"Unknown {model_type} model {model_name}")
def from_model_type(model_name: str, model_type: ModelType, model_task: ModelTask, **kwargs: Any) -> InferenceModel:
return get_model_class(model_name, model_type, model_task)(model_name, **kwargs)
def get_model_deps(model_name: str, model_type: ModelType, model_task: ModelTask) -> list[tuple[ModelType, ModelTask]]:
return get_model_class(model_name, model_type, model_task).depends
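
Not part of the diff, a usage sketch of the new factory; the model names below are illustrative and assume they appear in the constants lists:

from app.models import from_model_type, get_model_deps
from app.schemas import ModelTask, ModelType

# Resolve and instantiate a CLIP visual encoder for search.
encoder = from_model_type("ViT-B-32__openai", ModelType.VISUAL, ModelTask.SEARCH)

# Recognition declares a dependency on detection, so callers can preload both.
deps = get_model_deps("buffalo_l", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION)
# deps == [(ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)]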

View File

@@ -3,7 +3,7 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from pathlib import Path
from shutil import rmtree
from typing import Any
from typing import Any, ClassVar
import onnxruntime as ort
from huggingface_hub import snapshot_download
@@ -11,13 +11,14 @@ from huggingface_hub import snapshot_download
import ann.ann
from app.models.constants import SUPPORTED_PROVIDERS
from ..config import get_cache_dir, get_hf_model_name, log, settings
from ..schemas import ModelRuntime, ModelType
from ..config import clean_name, log, settings
from ..schemas import ModelFormat, ModelIdentity, ModelSession, ModelTask, ModelType
from .ann import AnnSession
class InferenceModel(ABC):
_model_type: ModelType
depends: ClassVar[list[ModelIdentity]]
identity: ClassVar[ModelIdentity]
def __init__(
self,
@@ -26,16 +27,16 @@ class InferenceModel(ABC):
providers: list[str] | None = None,
provider_options: list[dict[str, Any]] | None = None,
sess_options: ort.SessionOptions | None = None,
preferred_runtime: ModelRuntime | None = None,
preferred_format: ModelFormat | None = None,
**model_kwargs: Any,
) -> None:
self.loaded = False
self.model_name = model_name
self.model_name = clean_name(model_name)
self.cache_dir = Path(cache_dir) if cache_dir is not None else self.cache_dir_default
self.providers = providers if providers is not None else self.providers_default
self.provider_options = provider_options if provider_options is not None else self.provider_options_default
self.sess_options = sess_options if sess_options is not None else self.sess_options_default
self.preferred_runtime = preferred_runtime if preferred_runtime is not None else self.preferred_runtime_default
self.preferred_format = preferred_format if preferred_format is not None else self.preferred_format_default
def download(self) -> None:
if not self.cached:
@@ -47,35 +48,36 @@ class InferenceModel(ABC):
def load(self) -> None:
if self.loaded:
return
self.download()
log.info(f"Loading {self.model_type.replace('-', ' ')} model '{self.model_name}' to memory")
self._load()
self.session = self._load()
self.loaded = True
def predict(self, inputs: Any, **model_kwargs: Any) -> Any:
def predict(self, *inputs: Any, **model_kwargs: Any) -> Any:
self.load()
if model_kwargs:
self.configure(**model_kwargs)
return self._predict(inputs)
return self._predict(*inputs, **model_kwargs)
@abstractmethod
def _predict(self, inputs: Any) -> Any: ...
def _predict(self, *inputs: Any, **model_kwargs: Any) -> Any: ...
def configure(self, **model_kwargs: Any) -> None:
def configure(self, **kwargs: Any) -> None:
pass
def _download(self) -> None:
ignore_patterns = [] if self.preferred_runtime == ModelRuntime.ARMNN else ["*.armnn"]
ignore_patterns = [] if self.preferred_format == ModelFormat.ARMNN else ["*.armnn"]
snapshot_download(
get_hf_model_name(self.model_name),
f"immich-app/{clean_name(self.model_name)}",
cache_dir=self.cache_dir,
local_dir=self.cache_dir,
local_dir_use_symlinks=False,
ignore_patterns=ignore_patterns,
)
@abstractmethod
def _load(self) -> None: ...
def _load(self) -> ModelSession:
return self._make_session(self.model_path)
def clear_cache(self) -> None:
if not self.cache_dir.exists():
@@ -99,7 +101,7 @@ class InferenceModel(ABC):
self.cache_dir.unlink()
self.cache_dir.mkdir(parents=True, exist_ok=True)
def _make_session(self, model_path: Path) -> AnnSession | ort.InferenceSession:
def _make_session(self, model_path: Path) -> ModelSession:
if not model_path.is_file():
onnx_path = model_path.with_suffix(".onnx")
if not onnx_path.is_file():
@@ -124,9 +126,21 @@ class InferenceModel(ABC):
raise ValueError(f"Unsupported model file type: {model_path.suffix}")
return session
@property
def model_dir(self) -> Path:
return self.cache_dir / self.model_type.value
@property
def model_path(self) -> Path:
return self.model_dir / f"model.{self.preferred_format}"
@property
def model_task(self) -> ModelTask:
return self.identity[1]
@property
def model_type(self) -> ModelType:
return self._model_type
return self.identity[0]
@property
def cache_dir(self) -> Path:
@@ -138,11 +152,11 @@ class InferenceModel(ABC):
@property
def cache_dir_default(self) -> Path:
return get_cache_dir(self.model_name, self.model_type)
return settings.cache_folder / self.model_task.value / self.model_name
@property
def cached(self) -> bool:
return self.cache_dir.is_dir() and any(self.cache_dir.iterdir())
return self.model_path.is_file()
@property
def providers(self) -> list[str]:
@@ -226,14 +240,14 @@ class InferenceModel(ABC):
return sess_options
@property
def preferred_runtime(self) -> ModelRuntime:
return self._preferred_runtime
def preferred_format(self) -> ModelFormat:
return self._preferred_format
@preferred_runtime.setter
def preferred_runtime(self, preferred_runtime: ModelRuntime) -> None:
log.debug(f"Setting preferred runtime to {preferred_runtime}")
self._preferred_runtime = preferred_runtime
@preferred_format.setter
def preferred_format(self, preferred_format: ModelFormat) -> None:
log.debug(f"Setting preferred format to {preferred_format}")
self._preferred_format = preferred_format
@property
def preferred_runtime_default(self) -> ModelRuntime:
return ModelRuntime.ARMNN if ann.ann.is_available and settings.ann else ModelRuntime.ONNX
def preferred_format_default(self) -> ModelFormat:
return ModelFormat.ARMNN if ann.ann.is_available and settings.ann else ModelFormat.ONNX
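
For orientation, a hedged sketch of what a subclass provides under the reworked base class; the class below is hypothetical and not part of this PR:

from typing import Any

from app.models.base import InferenceModel
from app.schemas import ModelSession, ModelTask, ModelType

class ExampleVisualModel(InferenceModel):
    # Each model declares what it is and which models must run before it.
    depends = []
    identity = (ModelType.VISUAL, ModelTask.SEARCH)

    def _load(self) -> ModelSession:
        # The base implementation already builds a session from self.model_path.
        return super()._load()

    def _predict(self, *inputs: Any, **kwargs: Any) -> Any:
        return self.session.run(None, {"image": inputs[0]})[0]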

View File

@@ -5,9 +5,9 @@ from aiocache.lock import OptimisticLock
from aiocache.plugins import TimingPlugin
from app.models import from_model_type
from app.models.base import InferenceModel
from ..schemas import ModelType, has_profiling
from .base import InferenceModel
from ..schemas import ModelTask, ModelType, has_profiling
class ModelCache:
@@ -31,28 +31,21 @@ class ModelCache:
if profiling:
plugins.append(TimingPlugin())
self.revalidate_enable = revalidate
self.should_revalidate = revalidate
self.cache = SimpleMemoryCache(timeout=timeout, plugins=plugins, namespace=None)
async def get(self, model_name: str, model_type: ModelType, **model_kwargs: Any) -> InferenceModel:
"""
Args:
model_name: Name of model in the model hub used for the task.
model_type: Model type or task, which determines which model zoo is used.
Returns:
model: The requested model.
"""
key = f"{model_name}{model_type.value}{model_kwargs.get('mode', '')}"
async def get(
self, model_name: str, model_type: ModelType, model_task: ModelTask, **model_kwargs: Any
) -> InferenceModel:
key = f"{model_name}{model_type}{model_task}"
async with OptimisticLock(self.cache, key) as lock:
model: InferenceModel | None = await self.cache.get(key)
if model is None:
model = from_model_type(model_type, model_name, **model_kwargs)
model = from_model_type(model_name, model_type, model_task, **model_kwargs)
await lock.cas(model, ttl=model_kwargs.get("ttl", None))
elif self.revalidate_enable:
elif self.should_revalidate:
await self.revalidate(key, model_kwargs.get("ttl", None))
return model
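
A usage sketch of the updated cache API; the model name and surrounding handler are illustrative:

# Inside an async request handler with an existing ModelCache instance.
# The key now combines name, type, and task, so the textual and visual halves
# of the same CLIP model are cached independently.
model = await model_cache.get("ViT-B-32__openai", ModelType.VISUAL, ModelTask.SEARCH, ttl=300)
embedding = model.predict(image_bytes)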

View File

@@ -1,189 +0,0 @@
import json
from abc import abstractmethod
from functools import cached_property
from io import BytesIO
from pathlib import Path
from typing import Any, Literal
import numpy as np
from numpy.typing import NDArray
from PIL import Image
from tokenizers import Encoding, Tokenizer
from app.config import clean_name, log
from app.models.transforms import crop, get_pil_resampling, normalize, resize, to_numpy
from app.schemas import ModelType
from .base import InferenceModel
class BaseCLIPEncoder(InferenceModel):
_model_type = ModelType.CLIP
def __init__(
self,
model_name: str,
cache_dir: Path | str | None = None,
mode: Literal["text", "vision"] | None = None,
**model_kwargs: Any,
) -> None:
self.mode = mode
super().__init__(model_name, cache_dir, **model_kwargs)
def _load(self) -> None:
if self.mode == "text" or self.mode is None:
log.debug(f"Loading clip text model '{self.model_name}'")
self.text_model = self._make_session(self.textual_path)
log.debug(f"Loaded clip text model '{self.model_name}'")
if self.mode == "vision" or self.mode is None:
log.debug(f"Loading clip vision model '{self.model_name}'")
self.vision_model = self._make_session(self.visual_path)
log.debug(f"Loaded clip vision model '{self.model_name}'")
def _predict(self, image_or_text: Image.Image | str) -> NDArray[np.float32]:
if isinstance(image_or_text, bytes):
image_or_text = Image.open(BytesIO(image_or_text))
match image_or_text:
case Image.Image():
if self.mode == "text":
raise TypeError("Cannot encode image as text-only model")
outputs: NDArray[np.float32] = self.vision_model.run(None, self.transform(image_or_text))[0][0]
case str():
if self.mode == "vision":
raise TypeError("Cannot encode text as vision-only model")
outputs = self.text_model.run(None, self.tokenize(image_or_text))[0][0]
case _:
raise TypeError(f"Expected Image or str, but got: {type(image_or_text)}")
return outputs
@abstractmethod
def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
pass
@abstractmethod
def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
pass
@property
def textual_dir(self) -> Path:
return self.cache_dir / "textual"
@property
def visual_dir(self) -> Path:
return self.cache_dir / "visual"
@property
def model_cfg_path(self) -> Path:
return self.cache_dir / "config.json"
@property
def textual_path(self) -> Path:
return self.textual_dir / f"model.{self.preferred_runtime}"
@property
def visual_path(self) -> Path:
return self.visual_dir / f"model.{self.preferred_runtime}"
@property
def tokenizer_file_path(self) -> Path:
return self.textual_dir / "tokenizer.json"
@property
def tokenizer_cfg_path(self) -> Path:
return self.textual_dir / "tokenizer_config.json"
@property
def preprocess_cfg_path(self) -> Path:
return self.visual_dir / "preprocess_cfg.json"
@property
def cached(self) -> bool:
return self.textual_path.is_file() and self.visual_path.is_file()
@cached_property
def model_cfg(self) -> dict[str, Any]:
log.debug(f"Loading model config for CLIP model '{self.model_name}'")
model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open())
log.debug(f"Loaded model config for CLIP model '{self.model_name}'")
return model_cfg
@cached_property
def tokenizer_file(self) -> dict[str, Any]:
log.debug(f"Loading tokenizer file for CLIP model '{self.model_name}'")
tokenizer_file: dict[str, Any] = json.load(self.tokenizer_file_path.open())
log.debug(f"Loaded tokenizer file for CLIP model '{self.model_name}'")
return tokenizer_file
@cached_property
def tokenizer_cfg(self) -> dict[str, Any]:
log.debug(f"Loading tokenizer config for CLIP model '{self.model_name}'")
tokenizer_cfg: dict[str, Any] = json.load(self.tokenizer_cfg_path.open())
log.debug(f"Loaded tokenizer config for CLIP model '{self.model_name}'")
return tokenizer_cfg
@cached_property
def preprocess_cfg(self) -> dict[str, Any]:
log.debug(f"Loading visual preprocessing config for CLIP model '{self.model_name}'")
preprocess_cfg: dict[str, Any] = json.load(self.preprocess_cfg_path.open())
log.debug(f"Loaded visual preprocessing config for CLIP model '{self.model_name}'")
return preprocess_cfg
class OpenCLIPEncoder(BaseCLIPEncoder):
def __init__(
self,
model_name: str,
cache_dir: Path | str | None = None,
mode: Literal["text", "vision"] | None = None,
**model_kwargs: Any,
) -> None:
super().__init__(clean_name(model_name), cache_dir, mode, **model_kwargs)
def _load(self) -> None:
super()._load()
self._load_tokenizer()
size: list[int] | int = self.preprocess_cfg["size"]
self.size = size[0] if isinstance(size, list) else size
self.resampling = get_pil_resampling(self.preprocess_cfg["interpolation"])
self.mean = np.array(self.preprocess_cfg["mean"], dtype=np.float32)
self.std = np.array(self.preprocess_cfg["std"], dtype=np.float32)
def _load_tokenizer(self) -> Tokenizer:
log.debug(f"Loading tokenizer for CLIP model '{self.model_name}'")
text_cfg: dict[str, Any] = self.model_cfg["text_cfg"]
context_length: int = text_cfg.get("context_length", 77)
pad_token: str = self.tokenizer_cfg["pad_token"]
self.tokenizer: Tokenizer = Tokenizer.from_file(self.tokenizer_file_path.as_posix())
pad_id: int = self.tokenizer.token_to_id(pad_token)
self.tokenizer.enable_padding(length=context_length, pad_token=pad_token, pad_id=pad_id)
self.tokenizer.enable_truncation(max_length=context_length)
log.debug(f"Loaded tokenizer for CLIP model '{self.model_name}'")
def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
tokens: Encoding = self.tokenizer.encode(text)
return {"text": np.array([tokens.ids], dtype=np.int32)}
def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
image = resize(image, self.size)
image = crop(image, self.size)
image_np = to_numpy(image)
image_np = normalize(image_np, self.mean, self.std)
return {"image": np.expand_dims(image_np.transpose(2, 0, 1), 0)}
class MCLIPEncoder(OpenCLIPEncoder):
def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
tokens: Encoding = self.tokenizer.encode(text)
return {
"input_ids": np.array([tokens.ids], dtype=np.int32),
"attention_mask": np.array([tokens.attention_mask], dtype=np.int32),
}

View File

@@ -0,0 +1,98 @@
import json
from abc import abstractmethod
from functools import cached_property
from pathlib import Path
from typing import Any
import numpy as np
from numpy.typing import NDArray
from tokenizers import Encoding, Tokenizer
from app.config import log
from app.models.base import InferenceModel
from app.schemas import ModelSession, ModelTask, ModelType
class BaseCLIPTextualEncoder(InferenceModel):
depends = []
identity = (ModelType.TEXTUAL, ModelTask.SEARCH)
def _predict(self, inputs: str, **kwargs: Any) -> NDArray[np.float32]:
res: NDArray[np.float32] = self.session.run(None, self.tokenize(inputs))[0][0]
return res
def _load(self) -> ModelSession:
log.debug(f"Loading tokenizer for CLIP model '{self.model_name}'")
self.tokenizer = self._load_tokenizer()
log.debug(f"Loaded tokenizer for CLIP model '{self.model_name}'")
return super()._load()
@abstractmethod
def _load_tokenizer(self) -> Tokenizer:
pass
@abstractmethod
def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
pass
@property
def model_cfg_path(self) -> Path:
return self.cache_dir / "config.json"
@property
def tokenizer_file_path(self) -> Path:
return self.model_dir / "tokenizer.json"
@property
def tokenizer_cfg_path(self) -> Path:
return self.model_dir / "tokenizer_config.json"
@cached_property
def model_cfg(self) -> dict[str, Any]:
log.debug(f"Loading model config for CLIP model '{self.model_name}'")
model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open())
log.debug(f"Loaded model config for CLIP model '{self.model_name}'")
return model_cfg
@cached_property
def tokenizer_file(self) -> dict[str, Any]:
log.debug(f"Loading tokenizer file for CLIP model '{self.model_name}'")
tokenizer_file: dict[str, Any] = json.load(self.tokenizer_file_path.open())
log.debug(f"Loaded tokenizer file for CLIP model '{self.model_name}'")
return tokenizer_file
@cached_property
def tokenizer_cfg(self) -> dict[str, Any]:
log.debug(f"Loading tokenizer config for CLIP model '{self.model_name}'")
tokenizer_cfg: dict[str, Any] = json.load(self.tokenizer_cfg_path.open())
log.debug(f"Loaded tokenizer config for CLIP model '{self.model_name}'")
return tokenizer_cfg
class OpenClipTextualEncoder(BaseCLIPTextualEncoder):
def _load_tokenizer(self) -> Tokenizer:
text_cfg: dict[str, Any] = self.model_cfg["text_cfg"]
context_length: int = text_cfg.get("context_length", 77)
pad_token: str = self.tokenizer_cfg["pad_token"]
tokenizer: Tokenizer = Tokenizer.from_file(self.tokenizer_file_path.as_posix())
pad_id: int = tokenizer.token_to_id(pad_token)
tokenizer.enable_padding(length=context_length, pad_token=pad_token, pad_id=pad_id)
tokenizer.enable_truncation(max_length=context_length)
return tokenizer
def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
tokens: Encoding = self.tokenizer.encode(text)
return {"text": np.array([tokens.ids], dtype=np.int32)}
class MClipTextualEncoder(OpenClipTextualEncoder):
def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
tokens: Encoding = self.tokenizer.encode(text)
return {
"input_ids": np.array([tokens.ids], dtype=np.int32),
"attention_mask": np.array([tokens.attention_mask], dtype=np.int32),
}
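
A quick sketch of the textual encoder in isolation (model name illustrative):

encoder = OpenClipTextualEncoder("ViT-B-32__openai")
# predict() lazily downloads and loads the model, then tokenizes and runs the session.
embedding = encoder.predict("a photo of a dog")  # NDArray[np.float32]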

View File

@@ -0,0 +1,69 @@
import json
from abc import abstractmethod
from functools import cached_property
from pathlib import Path
from typing import Any
import numpy as np
from numpy.typing import NDArray
from PIL import Image
from app.config import log
from app.models.base import InferenceModel
from app.models.transforms import crop_pil, decode_pil, get_pil_resampling, normalize, resize_pil, to_numpy
from app.schemas import ModelSession, ModelTask, ModelType
class BaseCLIPVisualEncoder(InferenceModel):
depends = []
identity = (ModelType.VISUAL, ModelTask.SEARCH)
def _predict(self, inputs: Image.Image | bytes, **kwargs: Any) -> NDArray[np.float32]:
image = decode_pil(inputs)
res: NDArray[np.float32] = self.session.run(None, self.transform(image))[0][0]
return res
@abstractmethod
def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
pass
@property
def model_cfg_path(self) -> Path:
return self.cache_dir / "config.json"
@property
def preprocess_cfg_path(self) -> Path:
return self.model_dir / "preprocess_cfg.json"
@cached_property
def model_cfg(self) -> dict[str, Any]:
log.debug(f"Loading model config for CLIP model '{self.model_name}'")
model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open())
log.debug(f"Loaded model config for CLIP model '{self.model_name}'")
return model_cfg
@cached_property
def preprocess_cfg(self) -> dict[str, Any]:
log.debug(f"Loading visual preprocessing config for CLIP model '{self.model_name}'")
preprocess_cfg: dict[str, Any] = json.load(self.preprocess_cfg_path.open())
log.debug(f"Loaded visual preprocessing config for CLIP model '{self.model_name}'")
return preprocess_cfg
class OpenClipVisualEncoder(BaseCLIPVisualEncoder):
def _load(self) -> ModelSession:
size: list[int] | int = self.preprocess_cfg["size"]
self.size = size[0] if isinstance(size, list) else size
self.resampling = get_pil_resampling(self.preprocess_cfg["interpolation"])
self.mean = np.array(self.preprocess_cfg["mean"], dtype=np.float32)
self.std = np.array(self.preprocess_cfg["std"], dtype=np.float32)
return super()._load()
def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
image = resize_pil(image, self.size)
image = crop_pil(image, self.size)
image_np = to_numpy(image)
image_np = normalize(image_np, self.mean, self.std)
return {"image": np.expand_dims(image_np.transpose(2, 0, 1), 0)}

View File

@@ -1,4 +1,5 @@
from app.config import clean_name
from app.schemas import ModelSource
_OPENCLIP_MODELS = {
"RN50__openai",
@@ -54,13 +55,16 @@ _INSIGHTFACE_MODELS = {
SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"]
def is_openclip(model_name: str) -> bool:
return clean_name(model_name) in _OPENCLIP_MODELS
def get_model_source(model_name: str) -> ModelSource | None:
cleaned_name = clean_name(model_name)
if cleaned_name in _INSIGHTFACE_MODELS:
return ModelSource.INSIGHTFACE
def is_mclip(model_name: str) -> bool:
return clean_name(model_name) in _MCLIP_MODELS
if cleaned_name in _MCLIP_MODELS:
return ModelSource.MCLIP
if cleaned_name in _OPENCLIP_MODELS:
return ModelSource.OPENCLIP
def is_insightface(model_name: str) -> bool:
return clean_name(model_name) in _INSIGHTFACE_MODELS
return None
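
The three is_* predicates collapse into a single lookup; a sketch, with model names illustrative and assumed to be in the corresponding sets:

get_model_source("ViT-B-32__openai")  # ModelSource.OPENCLIP
get_model_source("buffalo_l")         # ModelSource.INSIGHTFACE
get_model_source("not-a-real-model")  # None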

View File

@@ -1,90 +0,0 @@
from pathlib import Path
from typing import Any
import cv2
import numpy as np
from insightface.model_zoo import ArcFaceONNX, RetinaFace
from insightface.utils.face_align import norm_crop
from numpy.typing import NDArray
from app.config import clean_name
from app.schemas import Face, ModelType, is_ndarray
from .base import InferenceModel
class FaceRecognizer(InferenceModel):
_model_type = ModelType.FACIAL_RECOGNITION
def __init__(
self,
model_name: str,
min_score: float = 0.7,
cache_dir: Path | str | None = None,
**model_kwargs: Any,
) -> None:
self.min_score = model_kwargs.pop("minScore", min_score)
super().__init__(clean_name(model_name), cache_dir, **model_kwargs)
def _load(self) -> None:
self.det_model = RetinaFace(session=self._make_session(self.det_file))
self.rec_model = ArcFaceONNX(
self.rec_file.with_suffix(".onnx").as_posix(),
session=self._make_session(self.rec_file),
)
self.det_model.prepare(
ctx_id=0,
det_thresh=self.min_score,
input_size=(640, 640),
)
self.rec_model.prepare(ctx_id=0)
def _predict(self, image: NDArray[np.uint8] | bytes) -> list[Face]:
if isinstance(image, bytes):
decoded_image = cv2.imdecode(np.frombuffer(image, np.uint8), cv2.IMREAD_COLOR)
else:
decoded_image = image
assert is_ndarray(decoded_image, np.uint8)
bboxes, kpss = self.det_model.detect(decoded_image)
if bboxes.size == 0:
return []
assert is_ndarray(kpss, np.float32)
scores = bboxes[:, 4].tolist()
bboxes = bboxes[:, :4].round().tolist()
results = []
height, width, _ = decoded_image.shape
for (x1, y1, x2, y2), score, kps in zip(bboxes, scores, kpss):
cropped_img = norm_crop(decoded_image, kps)
embedding: NDArray[np.float32] = self.rec_model.get_feat(cropped_img)[0]
face: Face = {
"imageWidth": width,
"imageHeight": height,
"boundingBox": {
"x1": x1,
"y1": y1,
"x2": x2,
"y2": y2,
},
"score": score,
"embedding": embedding,
}
results.append(face)
return results
@property
def cached(self) -> bool:
return self.det_file.is_file() and self.rec_file.is_file()
@property
def det_file(self) -> Path:
return self.cache_dir / "detection" / f"model.{self.preferred_runtime}"
@property
def rec_file(self) -> Path:
return self.cache_dir / "recognition" / f"model.{self.preferred_runtime}"
def configure(self, **model_kwargs: Any) -> None:
self.det_model.det_thresh = model_kwargs.pop("minScore", self.det_model.det_thresh)

View File

@@ -0,0 +1,48 @@
from pathlib import Path
from typing import Any
import numpy as np
from insightface.model_zoo import RetinaFace
from numpy.typing import NDArray
from app.models.base import InferenceModel
from app.models.transforms import decode_cv2
from app.schemas import FaceDetectionOutput, ModelSession, ModelTask, ModelType
class FaceDetector(InferenceModel):
depends = []
identity = (ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)
def __init__(
self,
model_name: str,
min_score: float = 0.7,
cache_dir: Path | str | None = None,
**model_kwargs: Any,
) -> None:
self.min_score = model_kwargs.pop("minScore", min_score)
super().__init__(model_name, cache_dir, **model_kwargs)
def _load(self) -> ModelSession:
session = self._make_session(self.model_path)
self.model = RetinaFace(session=session)
self.model.prepare(ctx_id=0, det_thresh=self.min_score, input_size=(640, 640))
return session
def _predict(self, inputs: NDArray[np.uint8] | bytes, **kwargs: Any) -> FaceDetectionOutput:
inputs = decode_cv2(inputs)
bboxes, landmarks = self._detect(inputs)
return {
"boxes": bboxes[:, :4].round(),
"scores": bboxes[:, 4],
"landmarks": landmarks,
}
def _detect(self, inputs: NDArray[np.uint8] | bytes) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
return self.model.detect(inputs) # type: ignore
def configure(self, **kwargs: Any) -> None:
self.model.det_thresh = kwargs.pop("minScore", self.model.det_thresh)
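
A sketch of the detector used on its own (model name illustrative):

detector = FaceDetector("buffalo_l", min_score=0.7)
faces = detector.predict(image_bytes)
# faces is a FaceDetectionOutput: {"boxes": ..., "scores": ..., "landmarks": ...}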

View File

@@ -0,0 +1,77 @@
from pathlib import Path
from typing import Any
import numpy as np
import onnx
import onnxruntime as ort
from insightface.model_zoo import ArcFaceONNX
from insightface.utils.face_align import norm_crop
from numpy.typing import NDArray
from onnx.tools.update_model_dims import update_inputs_outputs_dims
from PIL import Image
from app.config import clean_name, log
from app.models.base import InferenceModel
from app.models.transforms import decode_cv2
from app.schemas import FaceDetectionOutput, FacialRecognitionOutput, ModelSession, ModelTask, ModelType
class FaceRecognizer(InferenceModel):
depends = [(ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)]
identity = (ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION)
def __init__(
self,
model_name: str,
min_score: float = 0.7,
cache_dir: Path | str | None = None,
**model_kwargs: Any,
) -> None:
self.min_score = model_kwargs.pop("minScore", min_score)
super().__init__(clean_name(model_name), cache_dir, **model_kwargs)
def _load(self) -> ModelSession:
session = self._make_session(self.model_path)
if not self._has_batch_dim(session):
self._add_batch_dim(self.model_path)
session = self._make_session(self.model_path)
self.model = ArcFaceONNX(
self.model_path.with_suffix(".onnx").as_posix(),
session=session,
)
return session
def _predict(
self, inputs: NDArray[np.uint8] | bytes | Image.Image, faces: FaceDetectionOutput, **kwargs: Any
) -> FacialRecognitionOutput:
if faces["boxes"].shape[0] == 0:
return []
inputs = decode_cv2(inputs)
embeddings: NDArray[np.float32] = self.model.get_feat(self._crop(inputs, faces))
return self.postprocess(faces, embeddings)
def postprocess(self, faces: FaceDetectionOutput, embeddings: NDArray[np.float32]) -> FacialRecognitionOutput:
return [
{
"boundingBox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
"embedding": embedding,
"score": score,
}
for (x1, y1, x2, y2), embedding, score in zip(faces["boxes"], embeddings, faces["scores"])
]
def _crop(self, image: NDArray[np.uint8], faces: FaceDetectionOutput) -> list[NDArray[np.uint8]]:
return [norm_crop(image, landmark) for landmark in faces["landmarks"]]
def _has_batch_dim(self, session: ort.InferenceSession) -> bool:
return not isinstance(session, ort.InferenceSession) or session.get_inputs()[0].shape[0] == "batch"
def _add_batch_dim(self, model_path: Path) -> None:
log.debug(f"Adding batch dimension to model {model_path}")
proto = onnx.load(model_path)
static_input_dims = [shape.dim_value for shape in proto.graph.input[0].type.tensor_type.shape.dim[1:]]
static_output_dims = [shape.dim_value for shape in proto.graph.output[0].type.tensor_type.shape.dim[1:]]
input_dims = {proto.graph.input[0].name: ["batch"] + static_input_dims}
output_dims = {proto.graph.output[0].name: ["batch"] + static_output_dims}
updated_proto = update_inputs_outputs_dims(proto, input_dims, output_dims)
onnx.save(updated_proto, model_path)
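
Composed, the two halves reproduce what the old monolithic FaceRecognizer did (sketch; model name illustrative):

detector = FaceDetector("buffalo_l")
recognizer = FaceRecognizer("buffalo_l")

faces = detector.predict(image_bytes)
# The recognizer takes the original image plus the detection output it depends on.
for face in recognizer.predict(image_bytes, faces):
    print(face["boundingBox"], face["score"], face["embedding"].shape)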

View File

@@ -1,3 +1,7 @@
from io import BytesIO
from typing import IO
import cv2
import numpy as np
from numpy.typing import NDArray
from PIL import Image
@@ -5,7 +9,7 @@ from PIL import Image
_PIL_RESAMPLING_METHODS = {resampling.name.lower(): resampling for resampling in Image.Resampling}
def resize(img: Image.Image, size: int) -> Image.Image:
def resize_pil(img: Image.Image, size: int) -> Image.Image:
if img.width < img.height:
return img.resize((size, int((img.height / img.width) * size)), resample=Image.Resampling.BICUBIC)
else:
@@ -13,7 +17,7 @@ def resize(img: Image.Image, size: int) -> Image.Image:
# https://stackoverflow.com/a/60883103
def crop(img: Image.Image, size: int) -> Image.Image:
def crop_pil(img: Image.Image, size: int) -> Image.Image:
left = int((img.size[0] / 2) - (size / 2))
upper = int((img.size[1] / 2) - (size / 2))
right = left + size
@@ -23,14 +27,36 @@ def crop(img: Image.Image, size: int) -> Image.Image:
def to_numpy(img: Image.Image) -> NDArray[np.float32]:
return np.asarray(img.convert("RGB")).astype(np.float32) / 255.0
return np.asarray(img if img.mode == "RGB" else img.convert("RGB"), dtype=np.float32) / 255.0
def normalize(
img: NDArray[np.float32], mean: float | NDArray[np.float32], std: float | NDArray[np.float32]
) -> NDArray[np.float32]:
return (img - mean) / std
return np.divide(img - mean, std, dtype=np.float32)
def get_pil_resampling(resample: str) -> Image.Resampling:
return _PIL_RESAMPLING_METHODS[resample.lower()]
def pil_to_cv2(image: Image.Image) -> NDArray[np.uint8]:
return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) # type: ignore
def decode_pil(image_bytes: bytes | IO[bytes] | Image.Image) -> Image.Image:
if isinstance(image_bytes, Image.Image):
return image_bytes
image = Image.open(BytesIO(image_bytes) if isinstance(image_bytes, bytes) else image_bytes)
image.load() # type: ignore
if not image.mode == "RGB":
image = image.convert("RGB")
return image
def decode_cv2(image_bytes: NDArray[np.uint8] | bytes | Image.Image) -> NDArray[np.uint8]:
if isinstance(image_bytes, bytes):
image_bytes = decode_pil(image_bytes) # pillow is much faster than cv2
if isinstance(image_bytes, Image.Image):
return pil_to_cv2(image_bytes)
return image_bytes
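
The new decode helpers let both pipelines accept bytes, file-like objects, PIL images, or ndarrays interchangeably (sketch):

pil_img = decode_pil(image_bytes)   # bytes, IO[bytes], or a PIL image passed through
cv2_img = decode_cv2(image_bytes)   # decodes via Pillow, then converts RGB to BGR
cv2_img = decode_cv2(pil_img)       # a PIL image or ndarray is accepted directly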