import warnings
from collections import OrderedDict
from functools import partial
from typing import Any, Callable, Optional, Union

import torch
from torch import nn, Tensor

from ...ops.misc import Conv2dNormActivation
from ...transforms._presets import ObjectDetection
from ...utils import _log_api_usage_once
from .. import mobilenet
from .._api import register_model, Weights, WeightsEnum
from .._meta import _COCO_CATEGORIES
from .._utils import _ovewrite_value_param, handle_legacy_interface
from ..mobilenetv3 import mobilenet_v3_large, MobileNet_V3_Large_Weights
from . import _utils as det_utils
from .anchor_utils import DefaultBoxGenerator
from .backbone_utils import _validate_trainable_layers
from .ssd import SSD, SSDScoringHead


__all__ = [
    "SSDLite320_MobileNet_V3_Large_Weights",
    "ssdlite320_mobilenet_v3_large",
]


# Building blocks of SSDlite as described in section 6.2 of MobileNetV2 paper
def _prediction_block(
    in_channels: int, out_channels: int, kernel_size: int, norm_layer: Callable[..., nn.Module]
) -> nn.Sequential:
    return nn.Sequential(
        # 3x3 depthwise with stride 1 and padding 1
        Conv2dNormActivation(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            groups=in_channels,
            norm_layer=norm_layer,
            activation_layer=nn.ReLU6,
        ),
        # 1x1 projection to output channels
        nn.Conv2d(in_channels, out_channels, 1),
    )


def _extra_block(in_channels: int, out_channels: int, norm_layer: Callable[..., nn.Module]) -> nn.Sequential:
    activation = nn.ReLU6
    intermediate_channels = out_channels // 2
    return nn.Sequential(
        # 1x1 projection to half output channels
        Conv2dNormActivation(
            in_channels, intermediate_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=activation
        ),
        # 3x3 depthwise with stride 2 and padding 1
        Conv2dNormActivation(
            intermediate_channels,
            intermediate_channels,
            kernel_size=3,
            stride=2,
            groups=intermediate_channels,
            norm_layer=norm_layer,
            activation_layer=activation,
        ),
        # 1x1 projection to output channels
        Conv2dNormActivation(
            intermediate_channels, out_channels, kernel_size=1, norm_layer=norm_layer, activation_layer=activation
        ),
    )


def _normal_init(conv: nn.Module):
    for layer in conv.modules():
        if isinstance(layer, nn.Conv2d):
            torch.nn.init.normal_(layer.weight, mean=0.0, std=0.03)
            if layer.bias is not None:
                torch.nn.init.constant_(layer.bias, 0.0)


class SSDLiteHead(nn.Module):
    def __init__(
        self, in_channels: list[int], num_anchors: list[int], num_classes: int, norm_layer: Callable[..., nn.Module]
    ):
        super().__init__()
        self.classification_head = SSDLiteClassificationHead(in_channels, num_anchors, num_classes, norm_layer)
        self.regression_head = SSDLiteRegressionHead(in_channels, num_anchors, norm_layer)

    def forward(self, x: list[Tensor]) -> dict[str, Tensor]:
        return {
            "bbox_regression": self.regression_head(x),
            "cls_logits": self.classification_head(x),
        }


class SSDLiteClassificationHead(SSDScoringHead):
    def __init__(
        self, in_channels: list[int], num_anchors: list[int], num_classes: int, norm_layer: Callable[..., nn.Module]
    ):
        cls_logits = nn.ModuleList()
        for channels, anchors in zip(in_channels, num_anchors):
            cls_logits.append(_prediction_block(channels, num_classes * anchors, 3, norm_layer))
        _normal_init(cls_logits)
        super().__init__(cls_logits, num_classes)


class SSDLiteRegressionHead(SSDScoringHead):
    def __init__(self, in_channels: list[int], num_anchors: list[int], norm_layer: Callable[..., nn.Module]):
        bbox_reg = nn.ModuleList()
        for channels, anchors in zip(in_channels, num_anchors):
            bbox_reg.append(_prediction_block(channels, 4 * anchors, 3, norm_layer))
        _normal_init(bbox_reg)
        super().__init__(bbox_reg, 4)
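
# The feature extractor below exposes six feature maps per image: two taken from the
# MobileNetV3 backbone (split at the C4 expansion layer) and four produced by the
# lightweight extra blocks. SSDLiteHead attaches one depthwise-separable prediction
# block (3x3 depthwise followed by a 1x1 projection) to each of these maps.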

class SSDLiteFeatureExtractorMobileNet(nn.Module):
    def __init__(
        self,
        backbone: nn.Module,
        c4_pos: int,
        norm_layer: Callable[..., nn.Module],
        width_mult: float = 1.0,
        min_depth: int = 16,
    ):
        super().__init__()
        _log_api_usage_once(self)

        if backbone[c4_pos].use_res_connect:
            raise ValueError("backbone[c4_pos].use_res_connect should be False")

        self.features = nn.Sequential(
            # As described in section 6.3 of MobileNetV3 paper
            nn.Sequential(*backbone[:c4_pos], backbone[c4_pos].block[0]),  # from start until C4 expansion layer
            nn.Sequential(backbone[c4_pos].block[1:], *backbone[c4_pos + 1 :]),  # from C4 depthwise until end
        )

        get_depth = lambda d: max(min_depth, int(d * width_mult))  # noqa: E731
        extra = nn.ModuleList(
            [
                _extra_block(backbone[-1].out_channels, get_depth(512), norm_layer),
                _extra_block(get_depth(512), get_depth(256), norm_layer),
                _extra_block(get_depth(256), get_depth(256), norm_layer),
                _extra_block(get_depth(256), get_depth(128), norm_layer),
            ]
        )
        _normal_init(extra)
        self.extra = extra

    def forward(self, x: Tensor) -> dict[str, Tensor]:
        # Get feature maps from backbone and extra. Can't be refactored due to JIT limitations.
        output = []
        for block in self.features:
            x = block(x)
            output.append(x)

        for block in self.extra:
            x = block(x)
            output.append(x)

        return OrderedDict([(str(i), v) for i, v in enumerate(output)])


def _mobilenet_extractor(
    backbone: Union[mobilenet.MobileNetV2, mobilenet.MobileNetV3],
    trainable_layers: int,
    norm_layer: Callable[..., nn.Module],
):
    backbone = backbone.features
    # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
    # The first and last blocks are always included because they are the C0 (conv1) and Cn.
    stage_indices = [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
    num_stages = len(stage_indices)

    # find the index of the layer from which we won't freeze
    if not 0 <= trainable_layers <= num_stages:
        raise ValueError(f"trainable_layers should be in the range [0, {num_stages}], instead got {trainable_layers}")
    freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]

    for b in backbone[:freeze_before]:
        for parameter in b.parameters():
            parameter.requires_grad_(False)

    return SSDLiteFeatureExtractorMobileNet(backbone, stage_indices[-2], norm_layer)


class SSDLite320_MobileNet_V3_Large_Weights(WeightsEnum):
    COCO_V1 = Weights(
        url="https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth",
        transforms=ObjectDetection,
        meta={
            "num_params": 3440060,
            "categories": _COCO_CATEGORIES,
            "min_size": (1, 1),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssdlite320-mobilenetv3-large",
            "_metrics": {
                "COCO-val2017": {
                    "box_map": 21.3,
                }
            },
            "_ops": 0.583,
            "_file_size": 13.418,
            "_docs": """These weights were produced by following a similar training recipe as on the paper.""",
        },
    )
    DEFAULT = COCO_V1
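
# The enum metadata can be inspected without downloading the checkpoint, e.g.:
#   SSDLite320_MobileNet_V3_Large_Weights.COCO_V1.meta["_metrics"]["COCO-val2017"]["box_map"]  # -> 21.3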

@register_model()
@handle_legacy_interface(
    weights=("pretrained", SSDLite320_MobileNet_V3_Large_Weights.COCO_V1),
    weights_backbone=("pretrained_backbone", MobileNet_V3_Large_Weights.IMAGENET1K_V1),
)
def ssdlite320_mobilenet_v3_large(
    *,
    weights: Optional[SSDLite320_MobileNet_V3_Large_Weights] = None,
    progress: bool = True,
    num_classes: Optional[int] = None,
    weights_backbone: Optional[MobileNet_V3_Large_Weights] = MobileNet_V3_Large_Weights.IMAGENET1K_V1,
    trainable_backbone_layers: Optional[int] = None,
    norm_layer: Optional[Callable[..., nn.Module]] = None,
    **kwargs: Any,
) -> SSD:
    """SSDlite model architecture with input size 320x320 and a MobileNetV3 Large backbone, as
    described at `Searching for MobileNetV3 <https://arxiv.org/abs/1905.02244>`__ and
    `MobileNetV2: Inverted Residuals and Linear Bottlenecks <https://arxiv.org/abs/1801.04381>`__.

    .. betastatus:: detection module

    See :func:`~torchvision.models.detection.ssd300_vgg16` for more details.

    Example:

        >>> model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(weights=SSDLite320_MobileNet_V3_Large_Weights.DEFAULT)
        >>> model.eval()
        >>> x = [torch.rand(3, 320, 320), torch.rand(3, 500, 400)]
        >>> predictions = model(x)

    Args:
        weights (:class:`~torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights`, optional): The
            pretrained weights to use. See
            :class:`~torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights` below for
            more details, and possible values. By default, no pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr. Default is True.
        num_classes (int, optional): number of output classes of the model (including the background).
        weights_backbone (:class:`~torchvision.models.MobileNet_V3_Large_Weights`, optional): The pretrained
            weights for the backbone.
        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
            Valid values are between 0 and 6, with 6 meaning all backbone layers are trainable. If ``None`` is passed
            (the default) this value is set to 6.
        norm_layer (callable, optional): Module specifying the normalization layer to use.
        **kwargs: parameters passed to the ``torchvision.models.detection.ssd.SSD``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/ssd.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights
        :members:
    """

    weights = SSDLite320_MobileNet_V3_Large_Weights.verify(weights)
    weights_backbone = MobileNet_V3_Large_Weights.verify(weights_backbone)

    if "size" in kwargs:
        warnings.warn("The size of the model is already fixed; ignoring the parameter.")

    if weights is not None:
        weights_backbone = None
        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
    elif num_classes is None:
        num_classes = 91

    trainable_backbone_layers = _validate_trainable_layers(
        weights is not None or weights_backbone is not None, trainable_backbone_layers, 6, 6
    )
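    # When neither weights nor weights_backbone is given, the whole backbone is left
    # trainable; otherwise this defaults to 6 trainable stages (the maximum) unless the
    # caller passes a smaller trainable_backbone_layers value.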

    # Enable reduced tail if no pretrained backbone is selected. See Table 6 of MobileNetV3 paper.
    reduce_tail = weights_backbone is None

    if norm_layer is None:
        norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.03)

    backbone = mobilenet_v3_large(
        weights=weights_backbone, progress=progress, norm_layer=norm_layer, reduced_tail=reduce_tail, **kwargs
    )
    if weights_backbone is None:
        # Change the default initialization scheme if not pretrained
        _normal_init(backbone)
    backbone = _mobilenet_extractor(
        backbone,
        trainable_backbone_layers,
        norm_layer,
    )

    size = (320, 320)
    anchor_generator = DefaultBoxGenerator([[2, 3] for _ in range(6)], min_ratio=0.2, max_ratio=0.95)
    out_channels = det_utils.retrieve_out_channels(backbone, size)
    num_anchors = anchor_generator.num_anchors_per_location()
    if len(out_channels) != len(anchor_generator.aspect_ratios):
        raise ValueError(
            f"The length of the output channels from the backbone {len(out_channels)} does not match "
            f"the length of the anchor generator aspect ratios {len(anchor_generator.aspect_ratios)}"
        )

    defaults = {
        "score_thresh": 0.001,
        "nms_thresh": 0.55,
        "detections_per_img": 300,
        "topk_candidates": 300,
        # Rescale the input in a way compatible to the backbone:
        # The following mean/std rescale the data from [0, 1] to [-1, 1]
        "image_mean": [0.5, 0.5, 0.5],
        "image_std": [0.5, 0.5, 0.5],
    }
    kwargs: Any = {**defaults, **kwargs}
    model = SSD(
        backbone,
        anchor_generator,
        size,
        num_classes,
        head=SSDLiteHead(out_channels, num_anchors, num_classes, norm_layer),
        **kwargs,
    )

    if weights is not None:
        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))

    return model
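
if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the upstream torchvision module); run it as a
    # module, e.g. `python -m torchvision.models.detection.ssdlite`, so the relative imports
    # above resolve. It builds the model without downloading any weights and mirrors the
    # forward pass shown in the docstring example.
    model = ssdlite320_mobilenet_v3_large(weights=None, weights_backbone=None)
    model.eval()
    images = [torch.rand(3, 320, 320), torch.rand(3, 500, 400)]
    with torch.no_grad():
        predictions = model(images)
    for pred in predictions:
        # Each prediction is a dict with "boxes" (N x 4), "labels" (N) and "scores" (N).
        print(pred["boxes"].shape, pred["labels"].shape, pred["scores"].shape)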