filter_files.py
#!/usr/bin/python3
import argparse
import glob
import logging
import os
import sys
import time
from pathlib import Path
import decord
import numpy as np
import torch
from decord import VideoReader, gpu, cpu
from yolov5.models.common import DetectMultiBackend
from yolov5.utils.general import check_img_size
from yolov5.utils.torch_utils import select_device
print(torch.__version__, torch.cuda.is_available())
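# have decord hand decoded frames back as torch tensors instead of its native NDArray type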
decord.bridge.set_bridge('torch')
# yolov5 is terrible at configuring logging correctly, so we disable it altogether
logging.disable(sys.maxsize)
parser = argparse.ArgumentParser(description='Filter videos with objects in them.')
parser.add_argument('-i', '--input', type=str, required=True, help='the input folder containing *.mp4 files')
parser.add_argument('-o', '--output', type=str, required=True,
                    help='the output folder that will contain mp4 files that have objects in them.')
parser.add_argument('-b', '--batch-size', type=int, default=512, required=False,
                    help='how many frames to batch in one prediction, reduce if they do not fit into RAM or VRAM')
parser.add_argument('-w', '--weights', type=str, default='yolov5s.pt', required=False,
                    help='the weights path for a YOLOv5 model')
parser.add_argument('-g', '--ignore', type=str, default='', required=False,
                    help='class names to ignore, comma separated. E.g: car,airplane')
parser.add_argument('-t', '--include', type=str, default='', required=False,
                    help='class names to include, comma separated. E.g: car,airplane.')
parser.add_argument('-c', '--confidence', type=float, default=0.7, required=False,
                    help='a fraction between 0 and 1, where 1.0 is really sure this is the object')
parser.add_argument('-dd', '--device', type=str, default="0", required=False,
                    help="which device to choose, by default the first GPU (0). Can be any number or 'cpu' for CPU")
parser.add_argument('-s', '--img-size', nargs='+', type=int, default=[640, 384], help='inference size w,h')
parser.add_argument('-gd', '--decord-gpu', type=bool, default=False, required=False,
                    help='use decord GPU, must be built and installed beforehand')
parser.add_argument('-cp', '--class-path', type=bool, default=True, required=False,
                    help='will move the input mp4 to a folder named after the recognized classes')
parser.add_argument('-sm', '--sampling', type=int, default=1, required=False,
                    help='uses only every n-th frame to classify')
args = parser.parse_args()
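
# example invocation (input/output paths and the class filter are illustrative):
#   ./filter_files.py -i ./camera_clips -o ./filtered -t car -c 0.6 -sm 5
# note: -gd/--decord-gpu and -cp/--class-path use argparse's type=bool, so any
# non-empty value (even "False") counts as True; omit the flag to keep the default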
input_path = args.input
output_path = Path(args.output)
weights = args.weights
batch_size = args.batch_size
device = args.device
decord_gpu = args.decord_gpu
confidence = args.confidence
rename_by_class = args.class_path
sampling = args.sampling
imgsz = args.img_size
imgsz *= 2 if len(imgsz) == 1 else 1  # a single value is expanded to (w, h)
ignore_set = set(args.ignore.split(','))
include_set = set(args.include.split(','))
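
# load the YOLOv5 model onto the selected device and adjust the inference size
# to a multiple of the model stride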
device = select_device(device)
model = DetectMultiBackend(weights, device=device, data="coco128_classes.yml")
stride, names = model.stride, model.names
imgsz = check_img_size(imgsz, s=stride)
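
# moves a processed video into the output folder: with --class-path the target
# subfolder is named after the detected (non-ignored) classes, otherwise the file
# goes straight into the output folder; videos without detections end up in "none"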
def rename_on_class_match(source_path: str, dtc: set):
    p = Path(source_path)
    dtc = dtc.difference(ignore_set)
    if len(dtc) > 0:
        print("moving file [%s], detected [%s]" % (p.name, ", ".join(dtc)))
        if rename_by_class:
            folder = output_path.joinpath("_".join(sorted(dtc)))
            os.makedirs(folder, exist_ok=True)
            p.rename(folder.joinpath(p.name))
        else:
            p.rename(output_path.joinpath(p.name))
    else:
        print("file [%s] has no detections, moving to None" % p.name)
        folder = output_path.joinpath("none")
        os.makedirs(folder, exist_ok=True)
        p.rename(folder.joinpath(p.name))

# TODO(thomas): we could take a timelapse with the detected bounding boxes
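# runs the model over one batch of frames and returns the set of class names
# that were detected above the confidence threshold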
def predict_batch(batch) -> set:
    start = time.time()
    # normalize to 0-1 and permute into (batch, channel, h, w)
    ix = batch.div(255).permute([0, 3, 1, 2]).to(device).float()
    # we can skip NMS since we don't care about the bounding boxes at all, thus the transfer back of the result
    # from the GPU is only an aggregated set of classes
    pred = model(ix)
    # the objectness score is at index 4, class scores start at index 5 (before that are the bounding box coords)
    predicted_classes = pred[pred[..., 4] > confidence][:, 5:].max(dim=1)[1].unique().cpu()
    s = set(map(lambda x: names[x], predicted_classes.numpy()))
    print("batch with %d images found [%s], took %.2fs" % (len(batch), ", ".join(s), time.time() - start))
    return s
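
# walk over every mp4 in the input folder, decode it in (optionally sampled) batches
# and move the file depending on which classes were detected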
files = glob.glob(input_path + "/*.mp4")
for i, file in enumerate(files):
    print("processing [%d/%d] [%s]..." % (i + 1, len(files), file))
    vr = VideoReader(file, ctx=gpu() if decord_gpu else cpu(), width=imgsz[0], height=imgsz[1])
    frame_numbers = np.arange(0, len(vr))
    if sampling > 1:
        # only keep every n-th frame
        frame_numbers = frame_numbers[frame_numbers % sampling == 0]
    ranges = [frame_numbers[j: j + batch_size] for j in range(0, len(frame_numbers), batch_size)]
    detected_classes = set()
    for r in ranges:
        frames = vr.get_batch(r)
        detected_classes = detected_classes.union(predict_batch(frames))
        # short circuit if we're looking for a specific class that we've found already
        if len(detected_classes.intersection(include_set)) > 0:
            break
    # explicitly delete the reader between files, see https://github.com/dmlc/decord/issues/222
    del vr
    rename_on_class_match(file, detected_classes)
print("done")