
Interpreting Hand Gestures and Sign Language in the Webcam with AI Using TensorFlow.js

寒冰屋 · Posted 2020-12-27 12:08:12

Contents

Starting Point

Detecting Gestures

More Gestures and Sign Language

Technical Footnotes

Finish Line

What's Next?

TensorFlow + JavaScript. The most popular, cutting-edge AI framework now supports the most widely used programming language on the planet, so let's make magic happen through deep learning right in our web browser, GPU-accelerated via WebGL using TensorFlow.js!

This is the final article in our six-part series:

  1. Getting Started with Deep Learning in Your Browser Using TensorFlow.js
  2. Dogs and Pizza: Computer Vision in the Browser with TensorFlow.js
  3. Fluffy Animal Detector: Recognizing Custom Objects in the Browser through Transfer Learning in TensorFlow.js
  4. Face Touch Detection with TensorFlow.js Part 1: Using Real-Time Webcam Data with Deep Learning
  5. Face Touch Detection with TensorFlow.js Part 2: Using BodyPix
  6. Interpreting Hand Gestures and Sign Language in the Webcam with AI Using TensorFlow.js

In this article, we will capture photos of different hand gestures through the webcam and use transfer learning on a pre-trained MobileNet model to build a computer vision AI that can recognize the various gestures in real time.

Starting Point

To recognize multiple hand gestures, we are going to start from nearly ready-to-use starter code and expand it to detect more categories of objects. The code will do the following:

  • Import TensorFlow.js and TensorFlow's tf-data.js
  • Define the gesture category labels
  • Add a video element for the webcam
  • Run the model prediction every 200 ms, once the first training has completed
  • Display the prediction result
  • Load the pre-trained MobileNet model and prepare it for transfer learning with as many classification categories as there are labels
  • Train on and classify various custom objects in the images
  • Skip disposing of the image and target samples during training, so they can be kept for multiple training runs

Here is the starting point for our project:

<html>
    <head>
        <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
        <style>
            img, video {
                object-fit: cover;
            }
        </style>
    </head>
    <body>
        <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
        <div>
            <button onclick="captureSample(0)">None</button>
            <button onclick="captureSample(1)">✊ (Rock)</button>
            <button onclick="captureSample(2)">🖐 (Paper)</button>
            <button onclick="captureSample(3)">✌️ (Scissors)</button>
            <button onclick="trainModel()">Train</button>
        </div>
        <h1 id="status">Loading...</h1>
        <script>
        let trainingData = [];

        const labels = [
            "None",
            "✊ (Rock)",
            "🖐 (Paper)",
            "✌️ (Scissors)",
        ];

        function setText( text ) {
            document.getElementById( "status" ).innerText = text;
        }

        async function predictImage() {
            if( !hasTrained ) { return; } // Skip prediction until trained
            const img = await getWebcamImage();
            let result = tf.tidy( () => {
                const input = img.reshape( [ 1, 224, 224, 3 ] );
                return model.predict( input );
            });
            img.dispose();
            let prediction = await result.data();
            result.dispose();
            // Get the index of the highest value in the prediction
            let id = prediction.indexOf( Math.max( ...prediction ) );
            setText( labels[ id ] );
        }

        function createTransferModel( model ) {
            // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
            const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
            const baseModel = tf.model({
                inputs: model.inputs,
                outputs: bottleneck.output
            });
            // Freeze the convolutional base
            for( const layer of baseModel.layers ) {
                layer.trainable = false;
            }
            // Add a classification head
            const newHead = tf.sequential();
            newHead.add( tf.layers.flatten( {
                inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
            } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( {
                units: labels.length,
                kernelInitializer: 'varianceScaling',
                useBias: false,
                activation: 'softmax'
            } ) );
            // Build the new model
            const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
            const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
            return newModel;
        }

        async function trainModel() {
            hasTrained = false;
            setText( "Training..." );
            // Setup training data
            const imageSamples = [];
            const targetSamples = [];
            trainingData.forEach( sample => {
                imageSamples.push( sample.image );
                let cat = [];
                for( let c = 0; c < labels.length; c++ ) {
                    cat.push( c === sample.category ? 1 : 0 );
                }
                targetSamples.push( tf.tensor1d( cat ) );
            });
            const xs = tf.stack( imageSamples );
            const ys = tf.stack( targetSamples );
            // Train the model on new image samples
            model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );
            await model.fit( xs, ys, {
                epochs: 30,
                shuffle: true,
                callbacks: {
                    onEpochEnd: ( epoch, logs ) => {
                        console.log( "Epoch #", epoch, logs );
                    }
                }
            });
            hasTrained = true;
        }

        // Mobilenet v1 0.25 224x224 model
        const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";
        let model = null;
        let hasTrained = false;

        async function setupWebcam() {
            return new Promise( ( resolve, reject ) => {
                const webcamElement = document.getElementById( "webcam" );
                const navigatorAny = navigator;
                navigator.getUserMedia = navigator.getUserMedia ||
                    navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                    navigatorAny.msGetUserMedia;
                if( navigator.getUserMedia ) {
                    navigator.getUserMedia( { video: true },
                        stream => {
                            webcamElement.srcObject = stream;
                            webcamElement.addEventListener( "loadeddata", resolve, false );
                        },
                        error => reject());
                }
                else {
                    reject();
                }
            });
        }

        async function getWebcamImage() {
            const img = ( await webcam.capture() ).toFloat();
            const normalized = img.div( 127 ).sub( 1 );
            return normalized;
        }

        async function captureSample( category ) {
            trainingData.push( {
                image: await getWebcamImage(),
                category: category
            });
            setText( "Captured: " + labels[ category ] );
        }

        let webcam = null;

        (async () => {
            // Load the model
            model = await tf.loadLayersModel( mobilenet );
            model = createTransferModel( model );
            await setupWebcam();
            webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
            // Setup prediction every 200 ms
            setInterval( predictImage, 200 );
        })();
        </script>
    </body>
</html>
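
One practical note on the starter code above: captured frames are kept as live tensors in trainingData between training runs (the last bullet in the list above), so GPU memory grows with every sample. A minimal sketch of how to watch this while experimenting, using TensorFlow.js's tf.memory(); the wrapper name captureSampleLogged is hypothetical:

    // Sketch: wrap captureSample() to log tensor usage after each capture.
    // tf.memory() reports the number of live tensors and allocated bytes.
    async function captureSampleLogged( category ) {
        await captureSample( category );
        const m = tf.memory();
        console.log( "Samples: " + trainingData.length +
            ", tensors: " + m.numTensors + ", bytes: " + m.numBytes );
    }
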
Detecting Gestures

The starting point, as built, can detect four different categories: none, rock (fist), paper (palm), and scissors. Try it with your webcam: click each category button while holding that gesture to capture some photos (5-6 each works well), then click the Train button to transfer the learning to the neural network. Afterwards, you can keep improving the model by capturing more photos and clicking Train again.
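
For reference, here is a minimal sketch of what trainModel() builds for each captured photo: the sample's category index becomes a one-hot target vector matching the labels array from the code above.

    // Sketch: a "✊ (Rock)" sample (category index 1) becomes the target [0, 1, 0, 0].
    const sampleCategory = 1; // "✊ (Rock)"
    const target = labels.map( ( _, c ) => ( c === sampleCategory ? 1 : 0 ) );
    console.log( target ); // [ 0, 1, 0, 0 ]
    // trainModel() stacks one such tensor per sample into the training targets ys
    const ys = tf.stack( [ tf.tensor1d( target ) ] ); // shape [ 1, labels.length ]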

More Gestures and Sign Language

As you might expect, adding more categories makes the task harder for the AI and requires more training time. However, the results are interesting, and the AI performs quite well even with only a few photos per category. Let's try adding some American Sign Language (ASL) gestures.

To add more, include additional buttons in the input list, update the numbers passed to captureSample(), and modify the labels array accordingly.

You can add any signs you like (a sketch of the required changes follows the list below). I tried adding these four, part of the emoji set:

  • 👌 (Letter D)
  • 👍 (Thumb Up)
  • 🖖 (Vulcan)
  • 🤟 (ILY - I Love You)
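
For example, adding the 👍 (Thumb Up) class from the list above takes just one more button and one more label; because the classification head sizes its output layer from labels.length, no other code changes are needed. A sketch, assuming button markup matching the starter page:

    <button onclick="captureSample( 4 )">👍 (Thumb Up)</button>

    const labels = [
        "None",
        "✊ (Rock)",
        "🖐 (Paper)",
        "✌️ (Scissors)",
        "👍 (Thumb Up)", // new class; index 4 matches the button above
    ];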

Technical Footnotes
  • If the AI doesn't seem to recognize your gestures well, try capturing more photos and then training the model over multiple rounds.
  • While training the model with various gestures, keep in mind that it sees the full image; it doesn't necessarily know that the hand alone is what distinguishes the categories. Without plenty of samples from different hands, it may have difficulty recognizing different gestures accurately.
  • The model sometimes learns to distinguish your left hand from your right hand, and sometimes it doesn't, which can affect predictions after multiple rounds of training (see the sketch after this list).
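
One idea for the left-hand/right-hand issue above, not part of the original article: mirror each captured sample so the model sees both orientations of a gesture. A sketch using TensorFlow.js's tf.image.flipLeftRight(); the function name captureSampleMirrored is hypothetical:

    // Sketch: capture a sample plus its horizontal mirror for the same category.
    async function captureSampleMirrored( category ) {
        const img = await getWebcamImage(); // shape [ 224, 224, 3 ]
        const flipped = tf.tidy( () =>
            tf.image.flipLeftRight( img.expandDims( 0 ) ) // flip expects a 4D batch
                .squeeze( [ 0 ] ) // back to [ 224, 224, 3 ]
        );
        trainingData.push( { image: img, category: category } );
        trainingData.push( { image: flipped, category: category } );
        setText( "Captured (mirrored): " + labels[ category ] );
    }
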
Finish Line

For your reference, here is the full code for this project:

<html>
    <head>
        <title>Interpreting Hand Gestures and Sign Language in the Webcam with AI using TensorFlow.js</title>
        <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@2.0.0/dist/tf.min.js"></script>
        <style>
            img, video {
                object-fit: cover;
            }
        </style>
    </head>
    <body>
        <video autoplay playsinline muted id="webcam" width="224" height="224"></video>
        <div>
            <button onclick="captureSample(0)">None</button>
            <button onclick="captureSample(1)">✊ (Rock)</button>
            <button onclick="captureSample(2)">🖐 (Paper)</button>
            <button onclick="captureSample(3)">✌️ (Scissors)</button>
            <button onclick="captureSample(4)">👌 (Letter D)</button>
            <button onclick="captureSample(5)">👍 (Thumb Up)</button>
            <button onclick="captureSample(6)">🖖 (Vulcan)</button>
            <button onclick="captureSample(7)">🤟 (ILY - I Love You)</button>
            <button onclick="trainModel()">Train</button>
        </div>
        <h1 id="status">Loading...</h1>
        <script>
        let trainingData = [];

        const labels = [
            "None",
            "✊ (Rock)",
            "🖐 (Paper)",
            "✌️ (Scissors)",
            "👌 (Letter D)",
            "👍 (Thumb Up)",
            "🖖 (Vulcan)",
            "🤟 (ILY - I Love You)"
        ];

        function setText( text ) {
            document.getElementById( "status" ).innerText = text;
        }

        async function predictImage() {
            if( !hasTrained ) { return; } // Skip prediction until trained
            const img = await getWebcamImage();
            let result = tf.tidy( () => {
                const input = img.reshape( [ 1, 224, 224, 3 ] );
                return model.predict( input );
            });
            img.dispose();
            let prediction = await result.data();
            result.dispose();
            // Get the index of the highest value in the prediction
            let id = prediction.indexOf( Math.max( ...prediction ) );
            setText( labels[ id ] );
        }

        function createTransferModel( model ) {
            // Create the truncated base model (remove the "top" layers, classification + bottleneck layers)
            const bottleneck = model.getLayer( "dropout" ); // This is the final layer before the conv_pred pre-trained classification layer
            const baseModel = tf.model({
                inputs: model.inputs,
                outputs: bottleneck.output
            });
            // Freeze the convolutional base
            for( const layer of baseModel.layers ) {
                layer.trainable = false;
            }
            // Add a classification head
            const newHead = tf.sequential();
            newHead.add( tf.layers.flatten( {
                inputShape: baseModel.outputs[ 0 ].shape.slice( 1 )
            } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 100, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( { units: 10, activation: 'relu' } ) );
            newHead.add( tf.layers.dense( {
                units: labels.length,
                kernelInitializer: 'varianceScaling',
                useBias: false,
                activation: 'softmax'
            } ) );
            // Build the new model
            const newOutput = newHead.apply( baseModel.outputs[ 0 ] );
            const newModel = tf.model( { inputs: baseModel.inputs, outputs: newOutput } );
            return newModel;
        }

        async function trainModel() {
            hasTrained = false;
            setText( "Training..." );
            // Setup training data
            const imageSamples = [];
            const targetSamples = [];
            trainingData.forEach( sample => {
                imageSamples.push( sample.image );
                let cat = [];
                for( let c = 0; c < labels.length; c++ ) {
                    cat.push( c === sample.category ? 1 : 0 );
                }
                targetSamples.push( tf.tensor1d( cat ) );
            });
            const xs = tf.stack( imageSamples );
            const ys = tf.stack( targetSamples );
            // Train the model on new image samples
            model.compile( { loss: "meanSquaredError", optimizer: "adam", metrics: [ "acc" ] } );
            await model.fit( xs, ys, {
                epochs: 30,
                shuffle: true,
                callbacks: {
                    onEpochEnd: ( epoch, logs ) => {
                        console.log( "Epoch #", epoch, logs );
                    }
                }
            });
            hasTrained = true;
        }

        // Mobilenet v1 0.25 224x224 model
        const mobilenet = "https://storage.googleapis.com/tfjs-models/tfjs/mobilenet_v1_0.25_224/model.json";
        let model = null;
        let hasTrained = false;

        async function setupWebcam() {
            return new Promise( ( resolve, reject ) => {
                const webcamElement = document.getElementById( "webcam" );
                const navigatorAny = navigator;
                navigator.getUserMedia = navigator.getUserMedia ||
                    navigatorAny.webkitGetUserMedia || navigatorAny.mozGetUserMedia ||
                    navigatorAny.msGetUserMedia;
                if( navigator.getUserMedia ) {
                    navigator.getUserMedia( { video: true },
                        stream => {
                            webcamElement.srcObject = stream;
                            webcamElement.addEventListener( "loadeddata", resolve, false );
                        },
                        error => reject());
                }
                else {
                    reject();
                }
            });
        }

        async function getWebcamImage() {
            const img = ( await webcam.capture() ).toFloat();
            const normalized = img.div( 127 ).sub( 1 );
            return normalized;
        }

        async function captureSample( category ) {
            trainingData.push( {
                image: await getWebcamImage(),
                category: category
            });
            setText( "Captured: " + labels[ category ] );
        }

        let webcam = null;

        (async () => {
            // Load the model
            model = await tf.loadLayersModel( mobilenet );
            model = createTransferModel( model );
            await setupWebcam();
            webcam = await tf.data.webcam( document.getElementById( "webcam" ) );
            // Setup prediction every 200 ms
            setInterval( predictImage, 200 );
        })();
        </script>
    </body>
</html>
What's Next?

This project showed you how to get started training your own computer vision AI to recognize a potentially unlimited range of gestures, objects, animal species, and even types of food. The rest is up to you; the future of deep learning and AI may well begin in your browser.

I hope you enjoyed following along with these examples. Don't forget to have fun as you try out more ideas!

Original article: https://www.codeproject.com/Articles/5272777/Interpreting-Hand-Gestures-and-Sign-Language-in-th
