In this article, I will summarize my journey into webcam image and audio processing.
First, here is the code of the project on GitHub:
Image processing
For image processing, we use cv2 module.
Test webcam
"""Preview the webcam stream at half resolution; press "q" to quit."""
import cv2

video = cv2.VideoCapture(1)  # Change device number if needed (0 is usually the built-in camera)

# Stream properties as reported by the driver.
fps = int(video.get(cv2.CAP_PROP_FPS))
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(fps, width, height)

while True:
    grabbed, frame = video.read()
    if not grabbed:
        # Device unavailable or stream ended; without this check
        # cv2.resize would crash on a None frame.
        break
    print("====New frame====")
    # Show at half resolution to keep the preview light.
    frame = cv2.resize(frame, (width // 2, height // 2))
    cv2.imshow("Video", frame)
    if cv2.waitKey(2) & 0xFF == ord("q"):
        break

# Release the camera handle and close the preview window.
video.release()
cv2.destroyAllWindows()
Source: https://github.com/aruno14/webcamProcessing/blob/main/test_webcam.py
Detect face and create mean image
The mean image is created by averaging the pixel values of all faces detected over time. In the next section, we will use this image as the input of our machine learning prediction models.
"""Detect faces in the webcam stream and display a running mean-face image."""
import cv2
import time
import numpy as np
from PIL import Image

video = cv2.VideoCapture(1)  # Change device number if needed
fps = int(video.get(cv2.CAP_PROP_FPS))
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(fps, width, height)

faceSize = 150  # Side length (px) of the square face crop and of the mean image
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
frameCount = 0
# Running average of every detected face; float dtype so the incremental
# mean stays exact (np.average returns float anyway after the first face).
faceMean = np.zeros((faceSize, faceSize, 3), np.float64)
faceMeanCount = 0

while True:
    grabbed, frame = video.read()
    if not grabbed:
        # Device unavailable or stream ended; avoid resizing a None frame.
        break
    frameCount += 1
    print("====Frame", frameCount, "====")
    # Side-by-side canvas: webcam frame on the left, mean face on the right.
    outputFrame = np.zeros((height, width + faceSize, 3), np.uint8)
    frame = cv2.resize(frame, (width, height))
    start_time = time.time()
    faces = face_cascade.detectMultiScale(frame, scaleFactor=1.5, minNeighbors=5)
    for (x, y, w, h) in faces:
        face = cv2.resize(frame[y:y + h, x:x + w], (faceSize, faceSize))
        # Fold the new face into the running mean:
        # mean = (mean * count + face) / (count + 1)
        faceMean = np.average([faceMean, face], axis=0, weights=[faceMeanCount, 1])
        faceMeanCount += 1
        # faceSize instead of a hard-coded 150 so the crop and the canvas stay in sync.
        outputFrame[0:faceSize, width:width + faceSize] = faceMean.astype(np.uint8)
        cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
    # Detection time was measured but never reported in the original; print it.
    elapsed_ms = (time.time() - start_time) * 1000
    print("Detection took %.1f ms" % elapsed_ms)
    outputFrame[0:height, 0:width] = frame
    cv2.imshow("Video", outputFrame)
    if cv2.waitKey(2) & 0xFF == ord("q"):
        break

video.release()
cv2.destroyAllWindows()
Source: https://github.com/aruno14/webcamProcessing/blob/main/test_webcam_face_mean.py
Then, we use three models to predict Gender, Age and Emotion
We use the mean face image for prediction.
Since the code is a little long, I do not paste it here; check the GitHub page below:
Source: https://github.com/aruno14/webcamProcessing/blob/main/test_webcam_face_ml.py
Audio Processing
For audio processing, we use pyaudio module.
Test mic
"""Read the microphone in one-second chunks and print a loudness estimate."""
import time
import numpy as np
import pyaudio
import struct

CHUNK = 44100                # Samples per read: one second at 44.1 kHz
# paInt16 matches the 16-bit ("h") struct unpack and the /32767 normalization
# below; the original paFloat32 delivered 4-byte float samples that a
# short-unpack would misinterpret.
FORMAT = pyaudio.paInt16
format_max = 32767           # Max magnitude of a signed 16-bit sample
CHANNELS, RATE = 1, 44100
decoded_data = []

p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                input=True, frames_per_buffer=CHUNK)
print("* recording")

start_time = time.time()
try:
    while True:
        data = stream.read(CHUNK)
        end_time = time.time()
        elapsed_ms = (end_time - start_time) * 1000
        print("elapsed_ms", elapsed_ms)
        start_time = end_time
        count = len(data) // 2  # Two bytes per 16-bit sample (integer division)
        shorts = struct.unpack("%dh" % count, data)
        newData = np.asarray(shorts) / format_max  # Normalized between -1~1
        # Mean of squares = signal power; a cheap loudness estimate.
        meanNoise = np.mean(np.square(newData))
        print("Sound:", meanNoise)
        decoded_data = np.concatenate([decoded_data, newData], axis=-1)
except KeyboardInterrupt:
    # Ctrl-C is the only way out of the loop; fall through to cleanup so the
    # "* done recording" message and stream teardown are actually reachable.
    pass

print("* done recording")
stream.stop_stream()
stream.close()
p.terminate()
Source: https://github.com/aruno14/webcamProcessing/blob/main/test_mic.py
Then, we use two models to predict Gender and Age
Since the code is a little long, I do not paste it here; check the GitHub page below:
Source: https://github.com/aruno14/webcamProcessing/blob/main/test_mic_ml.py
Image and Audio Processing
Finally, using threading, we execute image and audio processing at the same time.
Source: https://github.com/aruno14/webcamProcessing/blob/main/test_webcam_mic_ml.py