# Python机器视觉之基于OpenCV的手势检测

## 2 传统机器视觉的手势检测

### 2.1 轮廓检测法

```int Mat::checkVector(int _elemChannels, int _depth, bool _requireContinuous) const

{

return (depth() == _depth || _depth <= 0) &&

(isContinuous() || !_requireContinuous) &&

((dims == 2 && (((rows == 1 || cols == 1) && channels() == _elemChannels) || (cols == _elemChannels))) ||

(dims == 3 && channels() == 1 && size.p[2] == _elemChannels && (size.p[0] == 1 || size.p[1] == 1) &&

(isContinuous() || step.p[1] == step.p[2]*size.p[2])))

? (int)(total()*channels()/_elemChannels) : -1;

}
```

### 2.3 整体代码实现

2.3.1 算法流程

1. 求出手部的掩膜

2. 求出掩膜的轮廓

3. 求出轮廓的多边形拟合曲线

4. 求出多边形拟合曲线的凸包集，找出凸点

5. 求出多边形拟合曲线的凹陷集，找出凹点

6. 利用上面的凸凹点和手部中心点的几何关系来做简单的数字手势识别

（下面的实现用的是 C++，是学长早期写的代码；同学们需要的话，学长再出一个 Python 版本）

```cpp
#include <iostream>

#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <opencv2/core/core.hpp>
#include "copenni.cpp"

#include <iostream>

// Scale applied to raw depth values when converting them to 8 bits (255/4096).
#define DEPTH_SCALE_FACTOR 255./4096.
// Width and height, in pixels, of the region of interest placed around each hand.
#define ROI_HAND_WIDTH 140
#define ROI_HAND_HEIGHT 140
// Presumably the aperture size for a median blur of the hand mask.
#define MEDIAN_BLUR_K 5
// Frame resolution used to clamp regions of interest and pixel loops.
#define XRES  640
#define YRES  480
// Half-width of the 8-bit depth band around the hand depth that counts as "hand".
#define DEPTH_SEGMENT_THRESH 5
// Presumably the size of the colour palette (main defines ten colours).
#define MAX_HANDS_COLOR 10
// Number of per-hand slots (depth and region of interest) kept by main.
#define MAX_HANDS_NUMBER  10
// Minimum contour area for a contour to be treated as a possible hand.
#define HAND_LIKELY_AREA 2000
#define DELTA_POINT_DISTENCE 25     // threshold for the distance between hand centre point 1 and centre point 2
#define SEGMENT_POINT1_DISTANCE 27  // threshold for how far a convex point is from hand centre point 1
#define SEGMENT_POINT2_DISTANCE 30  // threshold for how far a convex point is from hand centre point 2

using namespace cv;
using namespace xn;
using namespace std;

int main (int argc, char **argv)
{
unsigned int convex_number_above_point1 = 0;
unsigned int concave_number_above_point1 = 0;
unsigned int convex_number_above_point2 = 0;
unsigned int concave_number_above_point2 = 0;
unsigned int convex_assist_above_point1 = 0;
unsigned int convex_assist_above_point2 = 0;
unsigned int point_y1 = 0;
unsigned int point_y2 = 0;
int number_result = -1;
bool recognition_flag = false;  //开始手部数字识别的标志

vector<Scalar> color_array;//采用默认的10种颜色
{
color_array.push_back(Scalar(255, 0, 0));
color_array.push_back(Scalar(0, 255, 0));
color_array.push_back(Scalar(0, 0, 255));
color_array.push_back(Scalar(255, 0, 255));
color_array.push_back(Scalar(255, 255, 0));
color_array.push_back(Scalar(0, 255, 255));
color_array.push_back(Scalar(128, 255, 0));
color_array.push_back(Scalar(0, 128, 255));
color_array.push_back(Scalar(255, 0, 128));
color_array.push_back(Scalar(255, 128, 255));
}
vector<unsigned int> hand_depth(MAX_HANDS_NUMBER, 0);
vector<Rect> hands_roi(MAX_HANDS_NUMBER, Rect(XRES/2, YRES/2, ROI_HAND_WIDTH, ROI_HAND_HEIGHT));

namedWindow("color image", CV_WINDOW_AUTOSIZE);
namedWindow("depth image", CV_WINDOW_AUTOSIZE);
namedWindow("hand_segment", CV_WINDOW_AUTOSIZE);    //显示分割出来的手的区域
namedWindow("handrecognition", CV_WINDOW_AUTOSIZE); //显示0~5数字识别的图像

COpenNI openni;
if(!openni.Initial())
return 1;

if(!openni.Start())
return 1;
while(1) {
if(!openni.UpdateData()) {
return 1;
}
/*获取并显示色彩图像*/
Mat color_image;
cvtColor(color_image_src, color_image, CV_RGB2BGR);

for(auto itUser = openni.hand_points_.cbegin(); itUser != openni.hand_points_.cend(); ++itUser) {

point_y1 = itUser->second.Y;
point_y2 = itUser->second.Y + DELTA_POINT_DISTENCE;
circle(color_image, Point(itUser->second.X, itUser->second.Y),
5, color_array.at(itUser->first % color_array.size()), 3, 8);

/*设置不同手部的深度*/
hand_depth.at(itUser->first % MAX_HANDS_COLOR) = (unsigned int)(itUser->second.Z* DEPTH_SCALE_FACTOR);//itUser->first会导致程序出现bug

/*设置不同手部的不同感兴趣区域*/
hands_roi.at(itUser->first % MAX_HANDS_NUMBER) = Rect(itUser->second.X - ROI_HAND_WIDTH/2, itUser->second.Y - ROI_HAND_HEIGHT/2,
ROI_HAND_WIDTH, ROI_HAND_HEIGHT);
hands_roi.at(itUser->first % MAX_HANDS_NUMBER).x =  itUser->second.X - ROI_HAND_WIDTH/2;
hands_roi.at(itUser->first % MAX_HANDS_NUMBER).y =  itUser->second.Y - ROI_HAND_HEIGHT/2;
hands_roi.at(itUser->first % MAX_HANDS_NUMBER).width = ROI_HAND_WIDTH;
hands_roi.at(itUser->first % MAX_HANDS_NUMBER).height = ROI_HAND_HEIGHT;
if(hands_roi.at(itUser->first % MAX_HANDS_NUMBER).x <= 0)
hands_roi.at(itUser->first % MAX_HANDS_NUMBER).x  = 0;
if(hands_roi.at(itUser->first % MAX_HANDS_NUMBER).x > XRES)
hands_roi.at(itUser->first % MAX_HANDS_NUMBER).x =  XRES;
if(hands_roi.at(itUser->first % MAX_HANDS_NUMBER).y <= 0)
hands_roi.at(itUser->first % MAX_HANDS_NUMBER).y = 0;
if(hands_roi.at(itUser->first % MAX_HANDS_NUMBER).y > YRES)
hands_roi.at(itUser->first % MAX_HANDS_NUMBER).y =  YRES;
}
imshow("color image", color_image);

/*获取并显示深度图像*/
Mat depth_image;
depth_image_src.convertTo(depth_image, CV_8U, DEPTH_SCALE_FACTOR);
imshow("depth image", depth_image);

for(auto itUser = openni.hand_points_.cbegin(); itUser != openni.hand_points_.cend(); ++itUser) {
for(int i = hands_roi.at(itUser->first % MAX_HANDS_NUMBER).x; i < std::min(hands_roi.at(itUser->first % MAX_HANDS_NUMBER).x+hands_roi.at(itUser->first % MAX_HANDS_NUMBER).width, XRES); i++)
for(int j = hands_roi.at(itUser->first % MAX_HANDS_NUMBER).y; j < std::min(hands_roi.at(itUser->first % MAX_HANDS_NUMBER).y+hands_roi.at(itUser->first % MAX_HANDS_NUMBER).height, YRES); j++) {
hand_segment_mask.at<unsigned char>(j, i) = ((hand_depth.at(itUser->first % MAX_HANDS_NUMBER)-DEPTH_SEGMENT_THRESH) < depth_image.at<unsigned char>(j, i))
& ((hand_depth.at(itUser->first % MAX_HANDS_NUMBER)+DEPTH_SEGMENT_THRESH) > depth_image.at<unsigned char>(j,i));
}
}
Mat hand_segment(color_image.size(), CV_8UC3);

std::vector< std::vector<Point> > contours;
Mat hand_recognition_image = Mat::zeros(color_image.rows, color_image.cols, CV_8UC3);

for(int i = 0; i < contours.size(); i++) {  //只有在检测到轮廓时才会去求它的多边形，凸包集，凹陷集
recognition_flag = true;
/*找出轮廓图像多边形拟合曲线*/
Mat contour_mat = Mat(contours[i]);
if(contourArea(contour_mat) > HAND_LIKELY_AREA) {   //比较有可能像手的区域
std::vector<Point> approx_poly_curve;
approxPolyDP(contour_mat, approx_poly_curve, 10, true);//找出轮廓的多边形拟合曲线
std::vector< std::vector<Point> > approx_poly_curve_debug;
approx_poly_curve_debug.push_back(approx_poly_curve);

drawContours(hand_recognition_image, contours, i, Scalar(255, 0, 0), 1, 8); //画出轮廓
//            drawContours(hand_recognition_image, approx_poly_curve_debug, 0, Scalar(256, 128, 128), 1, 8); //画出多边形拟合曲线

/*对求出的多边形拟合曲线求出其凸包集*/
vector<int> hull;
convexHull(Mat(approx_poly_curve), hull, true);
for(int i = 0; i < hull.size(); i++) {
circle(hand_recognition_image, approx_poly_curve[hull[i]], 2, Scalar(0, 255, 0), 2, 8);

/*统计在中心点1以上凸点的个数*/
if(approx_poly_curve[hull[i]].y <= point_y1) {
/*统计凸点与中心点1的y轴距离*/
long dis_point1 = abs(long(point_y1 - approx_poly_curve[hull[i]].y));
int dis1 = point_y1 - approx_poly_curve[hull[i]].y;
if(dis_point1 > SEGMENT_POINT1_DISTANCE && dis1 >= 0)  {
convex_assist_above_point1++;
}
convex_number_above_point1++;
}

/*统计在中心点2以上凸点的个数*/
if(approx_poly_curve[hull[i]].y <= point_y2)    {
/*统计凸点与中心点1的y轴距离*/
long dis_point2 = abs(long(point_y2 - approx_poly_curve[hull[i]].y));
int dis2 = point_y2 - approx_poly_curve[hull[i]].y;
if(dis_point2 > SEGMENT_POINT2_DISTANCE && dis2 >= 0)  {
convex_assist_above_point2++;
}
convex_number_above_point2++;
}
}

//            /*对求出的多边形拟合曲线求出凹陷集*/
std::vector<Vec4i> convexity_defects;
if(Mat(approx_poly_curve).checkVector(2, CV_32S) > 3)
convexityDefects(approx_poly_curve, Mat(hull), convexity_defects);
for(int i = 0; i < convexity_defects.size(); i++) {
circle(hand_recognition_image, approx_poly_curve[convexity_defects[i][2]] , 2, Scalar(0, 0, 255), 2, 8);

/*统计在中心点1以上凹陷点的个数*/
if(approx_poly_curve[convexity_defects[i][2]].y <= point_y1)
concave_number_above_point1++;

/*统计在中心点2以上凹陷点的个数*/
if(approx_poly_curve[convexity_defects[i][2]].y <= point_y2)
concave_number_above_point2++;
}
}
}

/**画出手势的中心点**/
for(auto itUser = openni.hand_points_.cbegin(); itUser != openni.hand_points_.cend(); ++itUser) {
circle(hand_recognition_image, Point(itUser->second.X, itUser->second.Y), 3, Scalar(0, 255, 255), 3, 8);
circle(hand_recognition_image, Point(itUser->second.X, itUser->second.Y + 25), 3, Scalar(255, 0, 255), 3, 8);
}

/*手势数字0~5的识别*/
//"0"的识别
if((convex_assist_above_point1 ==0 && convex_number_above_point2 >= 2 && convex_number_above_point2 <= 3 &&
concave_number_above_point2 <= 1 && concave_number_above_point1 <= 1) || (concave_number_above_point1 ==0
|| concave_number_above_point2 == 0) && recognition_flag == true)
number_result = 0;
//"1"的识别
if(convex_assist_above_point1 ==1 && convex_number_above_point1 >=1  && convex_number_above_point1 <=2 &&
convex_number_above_point2 >=2 && convex_assist_above_point2 == 1)
number_result = 1;
//"2"的识别
if(convex_number_above_point1 == 2 && concave_number_above_point1 == 1 && convex_assist_above_point2 == 2
/*convex_assist_above_point1 <=1*/ && concave_number_above_point2 == 1)
number_result = 2;
//"3"的识别
if(convex_number_above_point1 == 3 && concave_number_above_point1 <= 3 &&
concave_number_above_point1 >=1 && convex_number_above_point2 >= 3 && convex_number_above_point2 <= 4 &&
convex_assist_above_point2 == 3)
number_result = 3;
//"4"的识别
if(convex_number_above_point1 == 4 && concave_number_above_point1 <=3 && concave_number_above_point1 >=2 &&
convex_number_above_point2 == 4)
number_result = 4;
//"5"的识别
if(convex_number_above_point1 >=4 && convex_number_above_point2 == 5 && concave_number_above_point2 >= 3 &&
convex_number_above_point2 >= 4)
number_result = 5;
if(number_result !=0 && number_result != 1  && number_result != 2 && number_result != 3 && number_result != 4 && number_result != 5)
number_result == -1;

/*在手势识别图上显示匹配的数字*/
std::stringstream number_str;
number_str << number_result;
putText(hand_recognition_image, "Match: ", Point(0, 60), 4, 1, Scalar(0, 255, 0), 2, 0 );
if(number_result == -1)
putText(hand_recognition_image, " ", Point(120, 60), 4, 2, Scalar(255, 0 ,0), 2, 0);
else
putText(hand_recognition_image, number_str.str(), Point(150, 60), 4, 2, Scalar(255, 0 ,0), 2, 0);

imshow("handrecognition", hand_recognition_image);
imshow("hand_segment", hand_segment);

/*一个循环中对有些变量进行初始化操作*/
convex_number_above_point1 = 0;
convex_number_above_point2 = 0;
concave_number_above_point1 = 0;
concave_number_above_point2 = 0;
convex_assist_above_point1 = 0;
convex_assist_above_point2 = 0;
number_result = -1;
recognition_flag = false;
number_str.clear();

waitKey(20);
}

}
```

## 3 深度学习方法做手势识别

### 3.2 YOLO系列

YOLO 系列的网络模型最早源于 2016 年， 之后几年经过不断改进相继推出YOLOv2、 YOLOv3 等网络，直到今日yoloV5也诞生了，不得不感慨一句，darknet是真的肝。

### 3.3 SSD

SSD 作为典型的一阶段网络模型， 具有更高的操作性， 端到端的学习模式同样受到众多研究者的喜爱

### 3.4 实现步骤

3.4.1 数据集

• 图像大小：100*100
• 像素颜色空间：RGB
• 图片种类：6 种(0,1,2,3,4,5)
• 每种图片数量：200 张

3.4.2 图像预处理

3.4.3 构建卷积神经网络结构

Dropout： 增加鲁棒性帮助正则化和避免过拟合

3.4.4 实验训练过程及结果

### 3.5 关键代码

```python
# 作者：丹成学长 Q746876041， 需要完整代码联系学长获取
import tensorflow as tf
# Height and width, in pixels, of the square input image.
IMAGE_SIZE = 100
# Number of channels of the input image.
NUM_CHANNELS = 1
# Side length of the square kernels in the first convolution layer.
CONV1_SIZE = 4
# Number of kernels (output channels) in the first convolution layer.
CONV1_KERNEL_NUM = 8
# Side length of the square kernels in the second convolution layer.
CONV2_SIZE = 2
# Number of kernels (output channels) in the second convolution layer.
CONV2_KERNEL_NUM = 16
# Number of units in the first fully connected layer.
FC_SIZE = 512
# Number of output units of the network.
OUTPUT_NODE = 6

def get_weight(shape, regularizer):
    """Create a weight variable of the given shape.

    The variable is initialised from a truncated normal distribution with a
    standard deviation of 0.1. When ``regularizer`` is not None, an L2 penalty
    built with that scale is added to the "losses" collection.

    Returns the created variable.
    """
    w = tf.Variable(tf.truncated_normal(shape, stddev=0.1))
    if regularizer is not None:
        tf.add_to_collection("losses", tf.contrib.layers.l2_regularizer(regularizer)(w))
    return w

def get_bias(shape):
    """Create a bias variable of the given shape, initialised to zeros."""
    b = tf.Variable(tf.zeros(shape))
    return b

def conv2d(x, w):
    """Convolve ``x`` with kernel ``w`` using stride 1 and "SAME" padding."""
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="SAME")

def max_pool_8x8(x):
    """Max-pool ``x`` with an 8x8 window, stride 4 and "SAME" padding."""
    return tf.nn.max_pool(x, ksize=[1, 8, 8, 1], strides=[1, 4, 4, 1], padding="SAME")

def max_pool_4x4(x):
    """Max-pool ``x`` with a 4x4 window, stride 2 and "SAME" padding."""
    return tf.nn.max_pool(x, ksize=[1, 4, 4, 1], strides=[1, 2, 2, 1], padding="SAME")

def forward(x, train, regularizer):
    """Build the forward pass: two conv/ReLU/max-pool stages and two dense layers.

    Args:
        x: input tensor; its batch dimension must be static because it is used
            to flatten the pooled features.
        train: when true, dropout (keep probability 0.5) is applied after the
            first fully connected layer.
        regularizer: L2 scale forwarded to ``get_weight``, or None for no
            regularisation.

    Returns:
        The unnormalised output tensor with ``OUTPUT_NODE`` columns.
    """
    # Stage 1: convolution -> bias + ReLU -> 8x8 max pooling.
    conv1_w = get_weight([CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_KERNEL_NUM], regularizer)
    conv1_b = get_bias([CONV1_KERNEL_NUM])
    conv1 = conv2d(x, conv1_w)
    # The listing used `relu1` without defining it and never used `conv1_b`;
    # the activation is restored in the same form as the fc1 layer below.
    relu1 = tf.nn.relu(conv1 + conv1_b)
    pool1 = max_pool_8x8(relu1)

    # Stage 2: convolution -> bias + ReLU -> 4x4 max pooling.
    conv2_w = get_weight([CONV2_SIZE, CONV2_SIZE, CONV1_KERNEL_NUM, CONV2_KERNEL_NUM], regularizer)
    conv2_b = get_bias([CONV2_KERNEL_NUM])
    conv2 = conv2d(pool1, conv2_w)
    # Same restoration as for `relu1`.
    relu2 = tf.nn.relu(conv2 + conv2_b)
    pool2 = max_pool_4x4(relu2)

    # Flatten the pooled feature maps to one row per sample.
    pool_shape = pool2.get_shape().as_list()
    nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
    reshaped = tf.reshape(pool2, [pool_shape[0], nodes])

    # First fully connected layer, with dropout while training.
    fc1_w = get_weight([nodes, FC_SIZE], regularizer)
    fc1_b = get_bias([FC_SIZE])
    fc1 = tf.nn.relu(tf.matmul(reshaped, fc1_w) + fc1_b)
    if train:
        fc1 = tf.nn.dropout(fc1, 0.5)

    # Output layer (no activation).
    fc2_w = get_weight([FC_SIZE, OUTPUT_NODE], regularizer)
    fc2_b = get_bias([OUTPUT_NODE])
    y = tf.matmul(fc1, fc2_w) + fc2_b
    return y
```
```python
# 作者：丹成学长 Q746876041， 需要完整代码联系学长获取
import tensorflow as tf
import numpy as np
import gesture_forward
import gesture_backward
from image_processing import func5,func6
import cv2

def restore_model(testPicArr):
    """Classify one preprocessed image with the latest saved checkpoint.

    A fresh graph is built on every call, the moving-average shadow variables
    are restored from ``gesture_backward.MODEL_SAVE_PATH`` and the network is
    run on ``testPicArr``.

    Args:
        testPicArr: array fed to a placeholder of shape
            [1, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS].

    Returns:
        The result of ``argmax`` over the network output, or -1 when no
        checkpoint is found.
    """
    with tf.Graph().as_default():
        x = tf.placeholder(tf.float32, [
            1,
            gesture_forward.IMAGE_SIZE,
            gesture_forward.IMAGE_SIZE,
            gesture_forward.NUM_CHANNELS])
        y = gesture_forward.forward(x, False, None)
        prediction_op = tf.argmax(y, 1)

        # Restore the exponential-moving-average values of the variables.
        variable_averages = tf.train.ExponentialMovingAverage(gesture_backward.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(gesture_backward.MODEL_SAVE_PATH)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                preValue = sess.run(prediction_op, feed_dict={x: testPicArr})
                return preValue
            else:
                print("No checkpoint file found")
                return -1

def application01():
    """Classify still images whose paths are typed in by the user.

    Asks for the number of pictures, then for each picture path: preprocesses
    the image with ``func5``, saves the preprocessed image as "<i>ttt.jpg",
    and prints the predicted gesture.
    """
    testNum = int(input("input the number of test pictures:"))
    for i in range(testNum):
        testPic = input("the path of test picture:")
        img = func5(testPic)
        # Keep a copy of the preprocessed image for inspection.
        cv2.imwrite(str(i)+"ttt.jpg", img)
        # Shape and scale the image the way the network input expects it.
        img = img.reshape([1, 100, 100, 1])
        img = img.astype(np.float32)
        img = np.multiply(img, 1.0/255.0)
        preValue = restore_model(img)
        print ("The prediction number is:", preValue)

def application02():
    """Run gesture recognition on live frames from the default camera.

    Every frame is preprocessed with ``func6``, classified with
    ``restore_model`` and displayed with the prediction drawn on it.
    Press "q" in the window to stop.
    """
    # vc = cv2.VideoCapture("testVideo.mp4")
    vc = cv2.VideoCapture(0)
    # Frame rate and frame size; only needed by the optional VideoWriter below.
    fps = vc.get(cv2.CAP_PROP_FPS)
    size = (int(vc.get(cv2.CAP_PROP_FRAME_WIDTH)), int(vc.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    # Optional: create an output video file. FourCC codes:
    #   "X","V","I","D"  MPEG-4
    #   "I","4","2","0"  YUV
    #   "P","I","M","I"  MPEG-1
    #   "T","H","E","O"  Ogg (file extension .ogv)
    # vw = cv2.VideoWriter("ges_pro.avi", cv2.VideoWriter_fourcc("X","V","I","D"), fps, size)

    # Read the first frame. This call was missing from the listing, which left
    # `success` and `frame` undefined.
    success, frame = vc.read()

    # Optional: rotate the frames by 90 degrees and crop them to a square.
    #    rows = frame.shape[0]
    #    cols = frame.shape[1]
    #    t1 = int((cols-rows)/2)
    #    t2 = int(cols-t1)
    #    M = cv2.getRotationMatrix2D((cols/2,rows/2),90,1)
    while success:
        #        img = cv2.warpAffine(frame,M,(cols,rows))
        #        img = img[0:rows, t1:t2]
        img = func6(frame)
        img = img.reshape([1, 100, 100, 1])
        img = img.astype(np.float32)
        img = np.multiply(img, 1.0/255.0)
        # NOTE(review): restore_model rebuilds the graph and reloads the
        # checkpoint on every call, i.e. once per frame.
        preValue = restore_model(img)
        # Draw the prediction on the frame.
        cv2.putText(frame, "Gesture:"+str(preValue), (50, 50), cv2.FONT_HERSHEY_PLAIN, 2.0, (0, 0, 255), 1)
        # vw.write(frame)
        cv2.imshow("gesture", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
        # Read the next frame; the loop ends when no frame can be read.
        success, frame = vc.read()

    vc.release()
    cv2.destroyAllWindows()
    print("viedo app over!")

def main():
    """Entry point: run the live-camera demo (the still-image demo is disabled)."""
    # application01()
    application02()

# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
```