Numerical features
Categorical features
Image features
SIFT (Scale-Invariant Feature Transform), a classic technique for detecting keypoints and computing local feature descriptors
import cv2
import numpy as np
# Initialise SIFT detector
sift = cv2.SIFT_create()
# load the two images to match (the file names here are placeholders)
img1 = cv2.imread("image1.jpg", cv2.IMREAD_GRAYSCALE)
img2 = cv2.imread("image2.jpg", cv2.IMREAD_GRAYSCALE)
# compute keypoints and descriptors with SIFT
kp1, des1 = sift.detectAndCompute(img1, None)
kp2, des2 = sift.detectAndCompute(img2, None)
# find matches using k nearest neighbors
bf = cv2.BFMatcher()
matches = bf.knnMatch(des1, des2, k=2)
# apply ratio test to threshold the best matches
good = []
for m, n in matches:
    if m.distance < 0.75 * n.distance:
        good.append([m])
# draw the matches
img3 = cv2.drawMatchesKnn(
img1, kp1, img2, kp2, good, None, flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS
)
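To eyeball the result, the match visualization in img3 can be shown with matplotlib (a minimal sketch; OpenCV draws in BGR order, so the channels are flipped for display):
import matplotlib.pyplot as plt

plt.imshow(cv2.cvtColor(img3, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.show()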
# FLANN-based matcher: pick index parameters that match the descriptor type
# For float descriptors such as SIFT/SURF, use the KD-tree index:
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
# For binary descriptors such as ORB/BRIEF, use the LSH index instead:
FLANN_INDEX_LSH = 6
index_params = dict(
    algorithm=FLANN_INDEX_LSH, table_number=12, key_size=20, multi_probe_level=2
)
search_params = dict(checks=50)
sift = cv2.SIFT_create()
kp1, des1 = sift.detectAndCompute(img1, None)
kp2, des2 = sift.detectAndCompute(img2, None)
FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)
flann = cv2.FlannBasedMatcher(index_params, search_params)
matches = flann.knnMatch(des1, des2, k=2)
matchesMask = [[0, 0] for i in range(len(matches))]
for i, (m, n) in enumerate(matches):
    if m.distance < 0.7 * n.distance:
        matchesMask[i] = [1, 0]
draw_params = dict(
matchColor=(0, 255, 0),
singlePointColor=(255, 0, 0),
matchesMask=matchesMask,
flags=cv2.DrawMatchesFlags_DEFAULT,
)
img3 = cv2.drawMatchesKnn(img1, kp1, img2, kp2, matches, None, **draw_params)
import torch.nn as nn
class VGG19(nn.Module):
def __init__(self, num_classes=1000):
super(VGG19, self).__init__()
# Feature extraction layers: Convolutional and pooling layers
self.feature_extractor = nn.Sequential(
nn.Conv2d(
3, 64, kernel_size=3, padding=1
), # 3 input channels, 64 output channels, 3x3 kernel, 1 padding
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(
kernel_size=2, stride=2
), # Max pooling with 2x2 kernel and stride 2
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(),
nn.Conv2d(128, 128, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.ReLU(),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.ReLU(),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Fifth convolutional block (completes the 16 conv layers of VGG19
            # and brings a 224x224 input down to the 7x7 maps the classifier expects)
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
# Fully connected layers for classification
self.classifier = nn.Sequential(
nn.Linear(
512 * 7 * 7, 4096
), # 512 channels, 7x7 spatial dimensions after max pooling
nn.ReLU(),
nn.Dropout(0.5), # Dropout layer with 0.5 dropout probability
nn.Linear(4096, 4096),
nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(4096, num_classes), # Output layer with 'num_classes' output units
)
def forward(self, x):
x = self.feature_extractor(x) # Pass input through the feature extractor layers
x = x.view(x.size(0), -1) # Flatten the output for the fully connected layers
x = self.classifier(x) # Pass flattened output through the classifier layers
return x
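A quick sanity check for the class above, as a sketch: with a standard 224x224 RGB input (the resolution VGG was designed for), the five pooling stages reduce the feature map to 7x7 and the classifier emits one logit per class.
import torch

model = VGG19(num_classes=1000)
dummy = torch.randn(1, 3, 224, 224)  # one 224x224 RGB image
logits = model(dummy)
print(logits.shape)  # torch.Size([1, 1000])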
import torch
import torch.nn as nn
class BaseConv2d(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super(BaseConv2d, self).__init__()
self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
self.relu = nn.ReLU()
def forward(self, x):
x = self.conv(x)
x = self.relu(x)
return x
class InceptionModule(nn.Module):
def __init__(self, in_channels, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_proj):
super(InceptionModule, self).__init__()
self.b1 = BaseConv2d(in_channels, n1x1, kernel_size=1)
self.b2 = nn.Sequential(
BaseConv2d(in_channels, n3x3red, kernel_size=1),
BaseConv2d(n3x3red, n3x3, kernel_size=3, padding=1),
)
self.b3 = nn.Sequential(
BaseConv2d(in_channels, n5x5red, kernel_size=1),
BaseConv2d(n5x5red, n5x5, kernel_size=5, padding=2),
)
self.b4 = nn.Sequential(
nn.MaxPool2d(3, stride=1, padding=1),
BaseConv2d(in_channels, pool_proj, kernel_size=1),
)
def forward(self, x):
y1 = self.b1(x)
y2 = self.b2(x)
y3 = self.b3(x)
y4 = self.b4(x)
return torch.cat([y1, y2, y3, y4], 1)
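Because the four branches are concatenated along the channel axis, the module's output has n1x1 + n3x3 + n5x5 + pool_proj channels while the spatial size is preserved. A quick check with a dummy input (a sketch using the 3a configuration from GoogLeNet below):
block = InceptionModule(192, 64, 96, 128, 16, 32, 32)
out = block(torch.randn(1, 192, 28, 28))
print(out.shape)  # torch.Size([1, 256, 28, 28]), since 64 + 128 + 32 + 32 = 256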
class AuxiliaryClassifier(nn.Module):
def __init__(self, in_channels, num_classes, dropout=0.7):
super(AuxiliaryClassifier, self).__init__()
self.pool = nn.AvgPool2d(5, stride=3)
self.conv = BaseConv2d(in_channels, 128, kernel_size=1)
self.relu = nn.ReLU(True)
self.flatten = nn.Flatten()
self.fc1 = nn.Linear(2048, 1024)
self.dropout = nn.Dropout(dropout)
self.fc2 = nn.Linear(1024, num_classes)
def forward(self, x):
x = self.pool(x)
x = self.conv(x)
x = self.flatten(x)
x = self.fc1(x)
x = self.relu(x)
x = self.dropout(x)
x = self.fc2(x)
return x
class GoogLeNet(nn.Module):
def __init__(self, use_aux=True):
super(GoogLeNet, self).__init__()
self.use_aux = use_aux
## block 1
self.conv1 = BaseConv2d(3, 64, kernel_size=7, stride=2, padding=3)
self.lrn1 = nn.LocalResponseNorm(5, alpha=0.0001, beta=0.75)
self.maxpool1 = nn.MaxPool2d(3, stride=2, padding=1)
## block 2
self.conv2 = BaseConv2d(64, 64, kernel_size=1)
self.conv3 = BaseConv2d(64, 192, kernel_size=3, padding=1)
self.lrn2 = nn.LocalResponseNorm(5, alpha=0.0001, beta=0.75)
self.maxpool2 = nn.MaxPool2d(3, stride=2, padding=1)
## block 3
self.inception3a = InceptionModule(192, 64, 96, 128, 16, 32, 32)
self.inception3b = InceptionModule(256, 128, 128, 192, 32, 96, 64)
self.maxpool3 = nn.MaxPool2d(3, stride=2, padding=1)
## block 4
self.inception4a = InceptionModule(480, 192, 96, 208, 16, 48, 64)
self.inception4b = InceptionModule(512, 160, 112, 224, 24, 64, 64)
self.inception4c = InceptionModule(512, 128, 128, 256, 24, 64, 64)
self.inception4d = InceptionModule(512, 112, 144, 288, 32, 64, 64)
self.inception4e = InceptionModule(528, 256, 160, 320, 32, 128, 128)
self.maxpool4 = nn.MaxPool2d(3, stride=2, padding=1)
## block 5
self.inception5a = InceptionModule(832, 256, 160, 320, 32, 128, 128)
self.inception5b = InceptionModule(832, 384, 192, 384, 48, 128, 128)
## auxiliary classifier
if self.use_aux:
self.aux1 = AuxiliaryClassifier(512, 1000)
self.aux2 = AuxiliaryClassifier(528, 1000)
## block 6
self.avgpool = nn.AvgPool2d(7, stride=1)
self.dropout = nn.Dropout(0.4)
self.fc = nn.Linear(1024, 1000)
def forward(self, x):
## block 1
x = self.conv1(x)
x = self.maxpool1(x)
x = self.lrn1(x)
## block 2
x = self.conv2(x)
x = self.conv3(x)
x = self.lrn2(x)
x = self.maxpool2(x)
## block 3
x = self.inception3a(x)
x = self.inception3b(x)
x = self.maxpool3(x)
## block 4
x = self.inception4a(x)
if self.use_aux:
aux1 = self.aux1(x)
x = self.inception4b(x)
x = self.inception4c(x)
x = self.inception4d(x)
if self.use_aux:
aux2 = self.aux2(x)
x = self.inception4e(x)
x = self.maxpool4(x)
## block 5
x = self.inception5a(x)
x = self.inception5b(x)
## block 6
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.dropout(x)
x = self.fc(x)
if self.use_aux:
return x, aux1, aux2
else:
return x
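A forward-pass sketch (assuming 224x224 inputs, which the stage shapes above are written for); with use_aux=True the network returns the main logits plus the two auxiliary outputs that are only used as extra losses during training:
model = GoogLeNet(use_aux=True)
dummy = torch.randn(1, 3, 224, 224)
logits, aux1, aux2 = model(dummy)
print(logits.shape, aux1.shape, aux2.shape)  # each torch.Size([1, 1000])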
import torch
import torch.nn as nn
import torch.nn.functional as F
class DepthwiseSeparableConv(nn.Module):
def __init__(self, in_channels, out_channels, stride):
super().__init__()
self.depthwise = nn.Conv2d(
in_channels,
in_channels,
kernel_size=3,
stride=stride,
padding=1,
groups=in_channels,
)
self.pointwise = nn.Conv2d(
in_channels, out_channels, kernel_size=1, stride=1, padding=0
)
def forward(self, x):
x = self.depthwise(x)
x = self.pointwise(x)
return x
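The motivation for the factorization is cost: the depthwise convolution filters each channel separately and the pointwise 1x1 convolution then mixes channels, which needs far fewer parameters and multiply-adds than a full 3x3 convolution. A small comparison for 256 -> 256 channels (a sketch):
standard = nn.Conv2d(256, 256, kernel_size=3, padding=1)
separable = DepthwiseSeparableConv(256, 256, stride=1)

def count_params(module):
    return sum(p.numel() for p in module.parameters())

print(count_params(standard), count_params(separable))  # roughly 590k vs. 68k parameters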
class MobileNet(nn.Module):
def __init__(self, num_classes=1000):
super().__init__()
self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1)
# MobileNet body
self.dw_conv2 = DepthwiseSeparableConv(32, 64, 1)
self.dw_conv3 = DepthwiseSeparableConv(64, 128, 2)
self.dw_conv4 = DepthwiseSeparableConv(128, 128, 1)
self.dw_conv5 = DepthwiseSeparableConv(128, 256, 2)
self.dw_conv6 = DepthwiseSeparableConv(256, 256, 1)
self.dw_conv7 = DepthwiseSeparableConv(256, 512, 2)
# 5 depthwise separable convolutions with stride 1
self.dw_conv8 = DepthwiseSeparableConv(512, 512, 1)
self.dw_conv9 = DepthwiseSeparableConv(512, 512, 1)
self.dw_conv10 = DepthwiseSeparableConv(512, 512, 1)
self.dw_conv11 = DepthwiseSeparableConv(512, 512, 1)
self.dw_conv12 = DepthwiseSeparableConv(512, 512, 1)
self.dw_conv13 = DepthwiseSeparableConv(512, 1024, 2)
self.dw_conv14 = DepthwiseSeparableConv(1024, 1024, 1)
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(1024, num_classes)
def forward(self, x):
x = self.conv1(x)
x = F.relu(x)
x = self.dw_conv2(x)
x = F.relu(x)
x = self.dw_conv3(x)
x = F.relu(x)
x = self.dw_conv4(x)
x = F.relu(x)
x = self.dw_conv5(x)
x = F.relu(x)
x = self.dw_conv6(x)
x = F.relu(x)
x = self.dw_conv7(x)
x = F.relu(x)
x = self.dw_conv8(x)
x = F.relu(x)
x = self.dw_conv9(x)
x = F.relu(x)
x = self.dw_conv10(x)
x = F.relu(x)
x = self.dw_conv11(x)
x = F.relu(x)
x = self.dw_conv12(x)
x = F.relu(x)
x = self.dw_conv13(x)
x = F.relu(x)
x = self.dw_conv14(x)
x = F.relu(x)
x = self.avg_pool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
# Create the model
mobilenet = MobileNet(num_classes=1000)
print(mobilenet)
pip install timm
import timm
import torch
# Load a pre-trained MobileNet model
model_name = "mobilenetv3_large_100"
model = timm.create_model(model_name, pretrained=True)
# If you want to use the model for inference
model.eval()
# Forward pass with a dummy input
# Batch size 1, 3 color channels, 224x224 image
input_tensor = torch.rand(1, 3, 224, 224)
output = model(input_tensor)
print(output)
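The output is a [1, 1000] tensor of logits over the ImageNet-1k classes. A softmax plus top-k makes it readable; this sketch prints class indices and probabilities rather than names, since mapping indices to label strings needs a separate ImageNet label list:
probabilities = torch.nn.functional.softmax(output[0], dim=0)
top5_prob, top5_idx = torch.topk(probabilities, 5)
for prob, idx in zip(top5_prob, top5_idx):
    print(f"class index {idx.item()}: probability {prob.item():.4f}")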
from transformers import AutoFeatureExtractor, ResNetForImageClassification
import torch
from datasets import load_dataset
dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50")
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")
inputs = feature_extractor(image, return_tensors="pt")
with torch.no_grad():
logits = model(**inputs).logits
# model predicts one of the 1000 ImageNet classes
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
from datasets import load_dataset
from transformers import AutoImageProcessor, SwinForImageClassification
import torch
model = SwinForImageClassification.from_pretrained(
"microsoft/swin-tiny-patch4-window7-224"
)
image_processor = AutoImageProcessor.from_pretrained(
"microsoft/swin-tiny-patch4-window7-224"
)
dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]
inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
logits = model(**inputs).logits
predicted_label_id = logits.argmax(-1).item()
predicted_label_text = model.config.id2label[predicted_label_id]
print(predicted_label_text)
Notable constructor arguments in the reference Swin Transformer implementation include window_size (the size of the local attention window), ape (whether to add an absolute position embedding to the patch embeddings), and fused_window_process (an optional fused kernel that speeds up window partitioning).
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from einops.layers.torch import Rearrange

# small stand-in for timm's to_2tuple helper used by ConvEmbed below
def to_2tuple(x):
    return x if isinstance(x, tuple) else (x, x)
# Excerpt from the CvT implementation: this helper builds the convolutional
# projection applied to queries, keys, and values inside the Attention block.
def _build_projection(self, dim_in, dim_out, kernel_size, padding, stride, method):
if method == "dw_bn":
proj = nn.Sequential(
OrderedDict(
[
(
"conv",
nn.Conv2d(
dim_in,
dim_in,
kernel_size=kernel_size,
padding=padding,
stride=stride,
bias=False,
groups=dim_in,
),
),
("bn", nn.BatchNorm2d(dim_in)),
("rearrage", Rearrange("b c h w -> b (h w) c")),
]
)
)
elif method == "avg":
proj = nn.Sequential(
OrderedDict(
[
(
"avg",
nn.AvgPool2d(
kernel_size=kernel_size,
padding=padding,
stride=stride,
ceil_mode=True,
),
),
("rearrage", Rearrange("b c h w -> b (h w) c")),
]
)
)
elif method == "linear":
proj = None
else:
raise ValueError("Unknown method ({})".format(method))
return proj
class ConvEmbed(nn.Module):
def __init__(
self, patch_size=7, in_chans=3, embed_dim=64, stride=4, padding=2, norm_layer=None
):
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size
self.proj = nn.Conv2d(
in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=padding
)
self.norm = norm_layer(embed_dim) if norm_layer else None
def forward(self, x):
x = self.proj(x)
B, C, H, W = x.shape
x = rearrange(x, "b c h w -> b (h w) c")
if self.norm:
x = self.norm(x)
x = rearrange(x, "b (h w) c -> b c h w", h=H, w=W)
return x
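A shape sketch for ConvEmbed with its defaults and a 224x224 RGB input: the 7x7 convolution with stride 4 and padding 2 yields a 56x56 grid of 64-dimensional tokens, returned in [B, C, H, W] layout for the next stage.
embed = ConvEmbed(patch_size=7, in_chans=3, embed_dim=64, stride=4, padding=2)
tokens = embed(torch.randn(1, 3, 224, 224))
print(tokens.shape)  # torch.Size([1, 64, 56, 56])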
from transformers import AutoImageProcessor, DinatForImageClassification
from PIL import Image
import requests
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
feature_extractor = AutoImageProcessor.from_pretrained("shi-labs/dinat-mini-in1k-224")
model = DinatForImageClassification.from_pretrained("shi-labs/dinat-mini-in1k-224")
inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# model predicts one of the 1000 ImageNet classes
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])
import fetch from "node-fetch";
import fs from "fs";

// Hugging Face access token (placeholder: read it from an environment variable)
const API_TOKEN = process.env.HF_API_TOKEN;
async function query(filename) {
const data = fs.readFileSync(filename);
const response = await fetch(
"https://api-inference.huggingface.co/models/apple/mobilevitv2-1.0-imagenet1k-256",
{
headers: { Authorization: `Bearer ${API_TOKEN}` },
method: "POST",
body: data,
}
);
const result = await response.json();
return result;
}
query("cats.jpg").then((response) => {
console.log(JSON.stringify(response));
});
!pip install -U -q datasets transformers[torch] evaluate timm albumentations accelerate
import numpy as np
from PIL import Image, ImageDraw
# `train_dataset` below is assumed to be the training split of the object-detection
# dataset loaded earlier with `datasets.load_dataset`, where each sample carries
# "image", "width", "height", and an "objects" dict with "id", "bbox", "area", and "category".
def draw_image_from_idx(dataset, idx):
sample = dataset[idx]
image = sample["image"]
annotations = sample["objects"]
draw = ImageDraw.Draw(image)
width, height = sample["width"], sample["height"]
for i in range(len(annotations["id"])):
box = annotations["bbox"][i]
class_idx = annotations["id"][i]
x, y, w, h = tuple(box)
if max(box) > 1.0:
x1, y1 = int(x), int(y)
x2, y2 = int(x + w), int(y + h)
else:
x1 = int(x * width)
y1 = int(y * height)
x2 = int((x + w) * width)
y2 = int((y + h) * height)
draw.rectangle((x1, y1, x2, y2), outline="red", width=1)
draw.text((x1, y1), annotations["category"][i], fill="white")
return image
draw_image_from_idx(dataset=train_dataset, idx=10)
import matplotlib.pyplot as plt
def plot_images(dataset, indices):
"""
Plot images and their annotations.
"""
num_rows = len(indices) // 3
num_cols = 3
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 10))
for i, idx in enumerate(indices):
row = i // num_cols
col = i % num_cols
# Draw image
image = draw_image_from_idx(dataset, idx)
# Display image on the corresponding subplot
axes[row, col].imshow(image)
axes[row, col].axis("off")
plt.tight_layout()
plt.show()
# Now use the function to plot images
plot_images(train_dataset, range(9))
import albumentations
import numpy as np
import torch
transform = albumentations.Compose(
[
albumentations.Resize(480, 480),
albumentations.HorizontalFlip(p=1.0),
albumentations.RandomBrightnessContrast(p=1.0),
],
bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)
# transforming a batch
def transform_aug_ann(examples):
image_ids = examples["image_id"]
images, bboxes, area, categories = [], [], [], []
for image, objects in zip(examples["image"], examples["objects"]):
image = np.array(image.convert("RGB"))[:, :, ::-1]
out = transform(image=image, bboxes=objects["bbox"], category=objects["id"])
area.append(objects["area"])
images.append(out["image"])
bboxes.append(out["bboxes"])
categories.append(out["category"])
targets = [
{"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes)
]
return image_processor(images=images, annotations=targets, return_tensors="pt")
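The transform above leans on two pieces defined elsewhere in the recipe: an image_processor and a formatted_anns helper that packs each image's boxes into COCO-style annotation dicts. A minimal sketch of both, assuming a DETR checkpoint as the model to fine-tune, plus applying the transform lazily to the training split:
from transformers import AutoImageProcessor

# assumption: a DETR-style detection checkpoint; use whichever model you fine-tune
image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")

def formatted_anns(image_id, category, area, bbox):
    # pack one image's objects into COCO-style annotation dicts
    return [
        {
            "image_id": image_id,
            "category_id": category[i],
            "iscrowd": 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }
        for i in range(len(category))
    ]

# apply the augmentation and preprocessing on the fly
train_dataset_transformed = train_dataset.with_transform(transform_aug_ann)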
The input of shape [dimension, height, width] is downsized by the CNN backbone and then flattened into a sequence whose length is smaller than height x width (one token per spatial position of the reduced feature map) before being passed to the transformer.
import torch
from torch import nn
from torchvision.models import resnet50
class DETR(nn.Module):
def __init__(
self, num_classes, hidden_dim, nheads, num_encoder_layers, num_decoder_layers
):
super().__init__()
self.backbone = nn.Sequential(*list(resnet50(pretrained=True).children())[:-2])
self.conv = nn.Conv2d(2048, hidden_dim, 1)
self.transformer = nn.Transformer(
hidden_dim, nheads, num_encoder_layers, num_decoder_layers
)
self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
self.linear_bbox = nn.Linear(hidden_dim, 4)
self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))
self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
def forward(self, inputs):
x = self.backbone(inputs)
h = self.conv(x)
H, W = h.shape[-2:]
pos = (
torch.cat(
[
self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
],
dim=-1,
)
.flatten(0, 1)
.unsqueeze(1)
)
h = self.transformer(
pos + h.flatten(2).permute(2, 0, 1), self.query_pos.unsqueeze(1)
)
return self.linear_class(h), self.linear_bbox(h).sigmoid()
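A forward-pass sketch for this minimal DETR, using the hyperparameters from the paper's demo (256-dim features, 8 heads, 6 encoder and 6 decoder layers) and COCO's 91 classes as an assumption; it returns class logits and normalized boxes for each of the 100 object queries:
detr = DETR(num_classes=91, hidden_dim=256, nheads=8,
            num_encoder_layers=6, num_decoder_layers=6)
detr.eval()
with torch.no_grad():
    logits, boxes = detr(torch.randn(1, 3, 800, 1200))
print(logits.shape, boxes.shape)  # torch.Size([100, 1, 92]) and torch.Size([100, 1, 4])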
from transformers import pipeline
from PIL import Image
import requests
segmentation = pipeline("image-segmentation", "facebook/maskformer-swin-base-coco")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
results = segmentation(images=image, subtask="panoptic")
results
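results is a list with one entry per predicted segment; the pipeline returns a label, a score, and a binary PIL mask under the "mask" key for each. A quick way to eyeball them next to the input (a sketch):
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, len(results) + 1, figsize=(4 * (len(results) + 1), 4))
axes[0].imshow(image)
axes[0].set_title("input")
axes[0].axis("off")
for ax, segment in zip(axes[1:], results):
    ax.imshow(segment["mask"], cmap="gray")
    ax.set_title(segment["label"])
    ax.axis("off")
plt.show()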
!pip install -q natten
from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation
from PIL import Image
import requests
import matplotlib.pyplot as plt
def run_segmentation(image, task_type):
"""Performs image segmentation based on the given task type.
Args:
image (PIL.Image): The input image.
task_type (str): The type of segmentation to perform ('semantic', 'instance', or 'panoptic').
Returns:
PIL.Image: The segmented image.
Raises:
ValueError: If the task type is invalid.
"""
processor = OneFormerProcessor.from_pretrained(
"shi-labs/oneformer_ade20k_dinat_large"
) # Load once here
model = OneFormerForUniversalSegmentation.from_pretrained(
"shi-labs/oneformer_ade20k_dinat_large"
)
if task_type == "semantic":
inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt")
outputs = model(**inputs)
predicted_map = processor.post_process_semantic_segmentation(
outputs, target_sizes=[image.size[::-1]]
)[0]
elif task_type == "instance":
inputs = processor(images=image, task_inputs=["instance"], return_tensors="pt")
outputs = model(**inputs)
predicted_map = processor.post_process_instance_segmentation(
outputs, target_sizes=[image.size[::-1]]
)[0]["segmentation"]
elif task_type == "panoptic":
inputs = processor(images=image, task_inputs=["panoptic"], return_tensors="pt")
outputs = model(**inputs)
predicted_map = processor.post_process_panoptic_segmentation(
outputs, target_sizes=[image.size[::-1]]
)[0]["segmentation"]
else:
raise ValueError(
"Invalid task type. Choose from 'semantic', 'instance', or 'panoptic'"
)
return predicted_map
def show_image_comparison(image, predicted_map, segmentation_title):
"""Displays the original image and the segmented image side-by-side.
Args:
image (PIL.Image): The original image.
predicted_map (PIL.Image): The segmented image.
segmentation_title (str): The title for the segmented image.
"""
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.imshow(image)
plt.title("Original Image")
plt.axis("off")
plt.subplot(1, 2, 2)
plt.imshow(predicted_map)
plt.title(segmentation_title + " Segmentation")
plt.axis("off")
plt.show()
url = "https://huggingface.co/datasets/shi-labs/oneformer_demo/resolve/main/ade20k.jpeg"
response = requests.get(url, stream=True)
response.raise_for_status() # Check for HTTP errors
image = Image.open(response.raw)
task_to_run = "semantic"
predicted_map = run_segmentation(image, task_to_run)
show_image_comparison(image, predicted_map, task_to_run)