diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000..9f9b947 --- /dev/null +++ b/.bazelrc @@ -0,0 +1,113 @@ +# The bazelrc file for MediaPipe OSS. + +# Tensorflow needs remote repo +common --experimental_repo_remote_exec + +# Basic build settings +build --jobs 128 +build --define='absl=1' # for gtest +build --enable_platform_specific_config + +# Enable stack traces +test --test_env="GTEST_INSTALL_FAILURE_SIGNAL_HANDLER=1" + +# Linux +build:linux --cxxopt=-std=c++17 +build:linux --host_cxxopt=-std=c++17 +build:linux --copt=-w + +# windows +build:windows --cxxopt=/std:c++17 +build:windows --host_cxxopt=/std:c++17 +build:windows --copt=/w +# For using M_* math constants on Windows with MSVC. +build:windows --copt=/D_USE_MATH_DEFINES +build:windows --host_copt=/D_USE_MATH_DEFINES + +# macOS +build:macos --cxxopt=-std=c++17 +build:macos --host_cxxopt=-std=c++17 +build:macos --copt=-w + +# Sets the default Apple platform to macOS. +build --apple_platform_type=macos + +# Compile ObjC++ files with C++17 +build --per_file_copt=.*\.mm\$@-std=c++17 + +# Allow debugging with XCODE +build --apple_generate_dsym + +# Android configs. +# Note: the documentation tells us to use @androidndk//:default_crosstool, but +# the automatic configuration transition uses //external:android/crosstool. +# Using it here works and spares us from having two different config_settings +# for Android. 
+build:android --crosstool_top=//external:android/crosstool +build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain +build:android --linkopt=-landroid +build:android --linkopt=-ldl +build:android --linkopt=-llog +build:android --linkopt=-lm +build:android --linkopt=-Wl,--gc-sections +# TODO: Remove this flag once we updated to NDK 25 +build:android --define=xnn_enable_arm_i8mm=false + +build:android_arm --config=android +build:android_arm --cpu=armeabi-v7a +build:android_arm --fat_apk_cpu=armeabi-v7a + +build:android_arm64 --config=android +build:android_arm64 --cpu=arm64-v8a +build:android_arm64 --fat_apk_cpu=arm64-v8a + +# iOS configs. +build:ios --apple_platform_type=ios +build:ios --copt=-fno-aligned-allocation + +build:ios_i386 --config=ios +build:ios_i386 --cpu=ios_i386 +build:ios_i386 --watchos_cpus=i386 + +build:ios_x86_64 --config=ios +build:ios_x86_64 --cpu=ios_x86_64 +build:ios_x86_64 --watchos_cpus=i386 + +build:ios_armv7 --config=ios +build:ios_armv7 --cpu=ios_armv7 +build:ios_armv7 --watchos_cpus=armv7k + +build:ios_arm64 --config=ios +build:ios_arm64 --cpu=ios_arm64 +build:ios_arm64 --watchos_cpus=armv7k + +build:ios_arm64e --config=ios +build:ios_arm64e --cpu=ios_arm64e +build:ios_arm64e --watchos_cpus=armv7k + +build:ios_fat --config=ios +build:ios_fat --ios_multi_cpus=armv7,arm64 +build:ios_fat --watchos_cpus=armv7k + +build:ios_sim_fat --config=ios +build:ios_sim_fat --ios_multi_cpus=x86_64,sim_arm64 + +build:ios_sim_device_fat --config=ios +build:ios_sim_device_fat --ios_multi_cpus=x86_64,sim_arm64,arm64 + +build:darwin_x86_64 --apple_platform_type=macos +build:darwin_x86_64 --macos_minimum_os=10.12 +build:darwin_x86_64 --cpu=darwin_x86_64 + +build:darwin_arm64 --apple_platform_type=macos +build:darwin_arm64 --macos_minimum_os=10.16 +build:darwin_arm64 --cpu=darwin_arm64 + +# Turn off maximum stdout size +build --experimental_ui_max_stdouterr_bytes=-1 + +# This bazelrc file is meant to be written by a setup script. 
+try-import %workspace%/.configure.bazelrc + +# This bazelrc file can be used for user-specific custom build settings. +try-import %workspace%/.user.bazelrc diff --git a/.bazelversion b/.bazelversion new file mode 100644 index 0000000..f3b5af3 --- /dev/null +++ b/.bazelversion @@ -0,0 +1 @@ +6.1.1 diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..331d387 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.git +Dockerfile diff --git a/.github/ISSUE_TEMPLATE/00-task-issue-template.yaml b/.github/ISSUE_TEMPLATE/00-task-issue-template.yaml new file mode 100644 index 0000000..d6130ed --- /dev/null +++ b/.github/ISSUE_TEMPLATE/00-task-issue-template.yaml @@ -0,0 +1,70 @@ +name: Task Issue +description: Use this template for assistance with using MediaPipe Tasks (developers.google.com/mediapipe/solutions) to deploy on-device ML solutions (e.g. gesture recognition etc.) on supported platforms +labels: 'type:task' +body: + - type: markdown + id: linkmodel + attributes: + value: Please make sure that this is a [Tasks](https://developers.google.com/mediapipe/solutions) issue. + - type: dropdown + id: customcode_model + attributes: + label: Have I written custom code (as opposed to using a stock example script provided in MediaPipe) + options: + - 'Yes' + - 'No' + validations: + required: false + - type: input + id: os_model + attributes: + label: OS Platform and Distribution + placeholder: e.g. Linux Ubuntu 16.04, Android 11, iOS 14.4 + validations: + required: true + - type: input + id: task-sdk-version + attributes: + label: MediaPipe Tasks SDK version + validations: + required: false + - type: input + id: taskname + attributes: + label: Task name (e.g. Image classification, Gesture recognition etc.) + validations: + required: true + - type: input + id: programminglang + attributes: + label: Programming Language and version (e.g. 
C++, Python, Java) + validations: + required: true + - type: input + id: current_model + attributes: + label: Describe the actual behavior + validations: + required: true + - type: input + id: expected_model + attributes: + label: Describe the expected behaviour + validations: + required: true + - type: textarea + id: what-happened_model + attributes: + label: Standalone code/steps you may have used to try to get what you need + description: If there is a problem, provide a reproducible test case that is the bare minimum necessary to generate the problem. If possible, please share a link to Colab, GitHub repo link or anything that we can use to reproduce the problem + render: shell + validations: + required: true + - type: textarea + id: other_info + attributes: + label: Other info / Complete Logs + description: Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached + render: shell + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/11-model-maker-issue-template.yaml b/.github/ISSUE_TEMPLATE/11-model-maker-issue-template.yaml new file mode 100644 index 0000000..7a6d921 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/11-model-maker-issue-template.yaml @@ -0,0 +1,71 @@ +name: Model Maker Issues +description: Use this template for assistance with using MediaPipe Model Maker (developers.google.com/mediapipe/solutions) to create custom on-device ML solutions. 
+labels: 'type:modelmaker' +body: + - type: markdown + id: linkmodel + attributes: + value: Please make sure that this is a [Model Maker](https://developers.google.com/mediapipe/solutions) issue + - type: dropdown + id: customcode_model + attributes: + label: Have I written custom code (as opposed to using a stock example script provided in MediaPipe) + options: + - 'Yes' + - 'No' + validations: + required: false + - type: input + id: os_model + attributes: + label: OS Platform and Distribution + placeholder: e.g. Linux Ubuntu 16.04, Android 11, iOS 14.4 + validations: + required: true + - type: input + id: pythonver + attributes: + label: Python Version + placeholder: e.g. 3.7, 3.8 + validations: + required: true + - type: input + id: modelmakerver + attributes: + label: MediaPipe Model Maker version + validations: + required: false + - type: input + id: taskname + attributes: + label: Task name (e.g. Image classification, Gesture recognition etc.) + validations: + required: true + - type: input + id: current_model + attributes: + label: Describe the actual behavior + validations: + required: true + - type: input + id: expected_model + attributes: + label: Describe the expected behaviour + validations: + required: true + - type: textarea + id: what-happened_model + attributes: + label: Standalone code/steps you may have used to try to get what you need + description: If there is a problem, provide a reproducible test case that is the bare minimum necessary to generate the problem. If possible, please share a link to Colab, GitHub repo link or anything that we can use to reproduce the problem + render: shell + validations: + required: true + - type: textarea + id: other_info + attributes: + label: Other info / Complete Logs + description: Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. 
Large logs and files should be attached + render: shell + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/12-studio-issue-template.yaml b/.github/ISSUE_TEMPLATE/12-studio-issue-template.yaml new file mode 100644 index 0000000..ffaa315 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/12-studio-issue-template.yaml @@ -0,0 +1,61 @@ +name: Studio Issues +description: Use this template for assistance with the MediaPipe Studio application. If this doesn’t look right, choose a different type. +labels: 'type:support' +body: + - type: markdown + id: linkmodel + attributes: + value: Please make sure that this is a MediaPipe Studio issue. + - type: input + id: os_model + attributes: + label: OS Platform and Distribution + placeholder: e.g. Linux Ubuntu 16.04, Android 11, iOS 14.4 + validations: + required: false + - type: input + id: browserver + attributes: + label: Browser and Version + validations: + required: false + - type: input + id: hardware + attributes: + label: Any microphone or camera hardware + validations: + required: false + - type: input + id: url + attributes: + label: URL that shows the problem + validations: + required: false + - type: input + id: current_model + attributes: + label: Describe the actual behavior + validations: + required: false + - type: input + id: expected_model + attributes: + label: Describe the expected behaviour + validations: + required: false + - type: textarea + id: what-happened_model + attributes: + label: Standalone code/steps you may have used to try to get what you need + description: If there is a problem, provide a reproducible test case that is the bare minimum necessary to generate the problem. 
If possible, please share a link to Colab, GitHub repo link or anything that we can use to reproduce the problem + render: shell + validations: + required: false + - type: textarea + id: other_info + attributes: + label: Other info / Complete Logs + description: Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached + render: shell + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/14-feature-request-issue-template.yaml b/.github/ISSUE_TEMPLATE/14-feature-request-issue-template.yaml new file mode 100644 index 0000000..c170891 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/14-feature-request-issue-template.yaml @@ -0,0 +1,60 @@ +name: Feature Request Issues +description: Use this template for raising a feature request. If this doesn’t look right, choose a different type. +labels: 'type:feature' +body: + - type: markdown + id: linkmodel + attributes: + value: Please make sure that this is a feature request. + - type: input + id: solution + attributes: + label: MediaPipe Solution (you are using) + validations: + required: false + - type: input + id: pgmlang + attributes: + label: Programming language + placeholder: C++/typescript/Python/Objective C/Android Java + validations: + required: false + - type: dropdown + id: willingcon + attributes: + label: Are you willing to contribute it + options: + - 'Yes' + - 'No' + validations: + required: false + - type: input + id: behaviour + attributes: + label: Describe the feature and the current behaviour/state + validations: + required: true + - type: input + id: api_change + attributes: + label: Will this change the current API? How? + validations: + required: false + - type: input + id: benifit + attributes: + label: Who will benefit with this feature? 
+ validations: + required: false + - type: input + id: use_case + attributes: + label: Please specify the use cases for this feature + validations: + required: true + - type: input + id: info_other + attributes: + label: Any Other info + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/15-build-install-issue-template.yaml b/.github/ISSUE_TEMPLATE/15-build-install-issue-template.yaml new file mode 100644 index 0000000..ded9d09 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/15-build-install-issue-template.yaml @@ -0,0 +1,108 @@ +name: Build/Install Issue +description: Use this template to report build/install issue +labels: 'type:build/install' +body: + - type: markdown + id: link + attributes: + value: Please make sure that this is a build/installation issue and also refer to the [troubleshooting](https://google.github.io/mediapipe/getting_started/troubleshooting.html) documentation before raising any issues. + - type: input + id: os + attributes: + label: OS Platform and Distribution + description: + placeholder: e.g. Linux Ubuntu 16.04, Android 11, iOS 14.4 + validations: + required: true + - type: input + id: compilerversion + attributes: + label: Compiler version + description: + placeholder: e.g. gcc/g++ 8 /Apple clang version 12.0.0 + validations: + required: false + - type: input + id: programminglang + attributes: + label: Programming Language and version + description: + placeholder: e.g. C++ 14, Python 3.6, Java + validations: + required: true + - type: input + id: virtualenv + attributes: + label: Installed using virtualenv? pip? Conda?(if python) + description: + placeholder: + validations: + required: false + - type: input + id: mediapipever + attributes: + label: MediaPipe version + description: + placeholder: e.g. 0.8.11, 0.9.1 + validations: + required: false + - type: input + id: bazelver + attributes: + label: Bazel version + description: + placeholder: e.g. 
5.0, 5.1 + validations: + required: false + - type: input + id: xcodeversion + attributes: + label: XCode and Tulsi versions (if iOS) + description: + placeholder: + validations: + required: false + - type: input + id: sdkndkversion + attributes: + label: Android SDK and NDK versions (if android) + description: + placeholder: + validations: + required: false + - type: dropdown + id: androidaar + attributes: + label: Android AAR (if android) + options: + - 'Yes' + - 'No' + validations: + required: false + - type: input + id: opencvversion + attributes: + label: OpenCV version (if running on desktop) + description: + placeholder: + validations: + required: false + - type: input + id: what-happened + attributes: + label: Describe the problem + description: Provide the exact sequence of commands / steps that you executed before running into the [problem](https://google.github.io/mediapipe/getting_started/getting_started.html) + placeholder: Tell us what you see! + value: "A bug happened!" + validations: + required: true + - type: textarea + id: code-to-reproduce + attributes: + label: Complete Logs + description: Include Complete Log information or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached + placeholder: Tell us what you see! + value: + render: shell + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/16-bug-issue-template.yaml b/.github/ISSUE_TEMPLATE/16-bug-issue-template.yaml new file mode 100644 index 0000000..efa925b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/16-bug-issue-template.yaml @@ -0,0 +1,110 @@ +name: Bug Issues +description: Use this template for reporting a bug. If this doesn’t look right, choose a different type. 
+labels: 'type:bug' +body: + - type: markdown + id: link + attributes: + value: Please make sure that this is a bug and also refer to the [troubleshooting](https://google.github.io/mediapipe/getting_started/troubleshooting.html), FAQ documentation before raising any issues. + - type: dropdown + id: customcode_model + attributes: + label: Have I written custom code (as opposed to using a stock example script provided in MediaPipe) + options: + - 'Yes' + - 'No' + validations: + required: false + - type: input + id: os + attributes: + label: OS Platform and Distribution + description: + placeholder: e.g. Linux Ubuntu 16.04, Android 11, iOS 14.4 + validations: + required: true + - type: input + id: mobile_device + attributes: + label: Mobile device if the issue happens on mobile device + description: + placeholder: e.g. iPhone 8, Pixel 2, Samsung Galaxy + validations: + required: false + - type: input + id: browser_version + attributes: + label: Browser and version if the issue happens on browser + placeholder: e.g. Google Chrome 109.0.5414.119, Safari 16.3 + validations: + required: false + - type: input + id: programminglang + attributes: + label: Programming Language and version + placeholder: e.g. C++, Python, Java + validations: + required: true + - type: input + id: mediapipever + attributes: + label: MediaPipe version + description: + placeholder: e.g. 0.8.11, 0.9.1 + validations: + required: false + - type: input + id: bazelver + attributes: + label: Bazel version + description: + placeholder: e.g. 5.0, 5.1 + validations: + required: false + - type: input + id: solution + attributes: + label: Solution + placeholder: e.g. 
FaceMesh, Pose, Holistic + validations: + required: true + - type: input + id: sdkndkversion + attributes: + label: Android Studio, NDK, SDK versions (if issue is related to building in Android environment) + validations: + required: false + - type: input + id: xcode_ver + attributes: + label: Xcode & Tulsi version (if issue is related to building for iOS) + validations: + required: false + - type: input + id: current_model + attributes: + label: Describe the actual behavior + validations: + required: true + - type: input + id: expected_model + attributes: + label: Describe the expected behaviour + validations: + required: true + - type: textarea + id: what-happened_model + attributes: + label: Standalone code/steps you may have used to try to get what you need + description: If there is a problem, provide a reproducible test case that is the bare minimum necessary to generate the problem. If possible, please share a link to Colab, GitHub repo link or anything that we can use to reproduce the problem + render: shell + validations: + required: true + - type: textarea + id: other_info + attributes: + label: Other info / Complete Logs + description: Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached + render: shell + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/17-documentation-issue-template.yaml b/.github/ISSUE_TEMPLATE/17-documentation-issue-template.yaml new file mode 100644 index 0000000..5f022e5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/17-documentation-issue-template.yaml @@ -0,0 +1,73 @@ +name: Documentation issue +description: Use this template for documentation related issues. If this doesn’t look right, choose a different type. +labels: 'type:doc-bug' +body: + - type: markdown + id: link + attributes: + value: Thank you for submitting a MediaPipe documentation issue. The MediaPipe docs are open source! 
To get involved, read the documentation Contributor Guide + - type: markdown + id: url + attributes: + value: URL(s) with the issue Please provide a link to the documentation entry, for example https://github.com/google/mediapipe/blob/master/docs/solutions/face_mesh.md#models + - type: input + id: description + attributes: + label: Description of issue (what needs changing) + description: Kinds of documentation problems + - type: input + id: clear_desc + attributes: + label: Clear description + description: For example, why should someone use this method? How is it useful? + validations: + required: true + - type: input + id: link + attributes: + label: Correct links + description: Is the link to the source code correct? + validations: + required: false + - type: input + id: parameter + attributes: + label: Parameters defined + description: Are all parameters defined and formatted correctly? + validations: + required: false + - type: input + id: returns + attributes: + label: Returns defined + description: Are return values defined? + validations: + required: false + - type: input + id: raises + attributes: + label: Raises listed and defined + description: Are the errors defined? For example, + validations: + required: false + - type: input + id: usage + attributes: + label: Usage example + description: Is there a usage example? See the API guide-on how to write testable usage examples. + validations: + required: false + - type: input + id: visual + attributes: + label: Request visuals, if applicable + description: Are there currently visuals? If not, will it clarify the content? + validations: + required: false + - type: input + id: pull + attributes: + label: Submit a pull request? + description: Are you planning to also submit a pull request to fix the issue? 
See the [docs](https://github.com/google/mediapipe/blob/master/CONTRIBUTING.md) + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/18-solution-legacy-issue-template.yaml b/.github/ISSUE_TEMPLATE/18-solution-legacy-issue-template.yaml new file mode 100644 index 0000000..acb0f5b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/18-solution-legacy-issue-template.yaml @@ -0,0 +1,78 @@ +name: Solution(Legacy) Issue +description: Use this template for assistance with a specific Mediapipe solution (google.github.io/mediapipe/solutions) such as "Pose", including inference model usage/training, solution-specific calculators etc. +labels: 'type:support' +body: + - type: markdown + id: linkmodel + attributes: + value: Please make sure that this is a [solution](https://google.github.io/mediapipe/solutions/solutions.html) issue. + - type: dropdown + id: customcode_model + attributes: + label: Have I written custom code (as opposed to using a stock example script provided in MediaPipe) + options: + - 'Yes' + - 'No' + validations: + required: false + - type: input + id: os_model + attributes: + label: OS Platform and Distribution + placeholder: e.g. Linux Ubuntu 16.04, Android 11, iOS 14.4 + validations: + required: false + - type: input + id: mediapipe_version + attributes: + label: MediaPipe version + validations: + required: false + - type: input + id: bazel_version + attributes: + label: Bazel version + validations: + required: false + - type: input + id: solution + attributes: + label: Solution + placeholder: e.g. FaceMesh, Pose, Holistic + validations: + required: false + - type: input + id: programminglang + attributes: + label: Programming Language and version + placeholder: e.g. 
C++, Python, Java + validations: + required: false + - type: input + id: current_model + attributes: + label: Describe the actual behavior + validations: + required: false + - type: input + id: expected_model + attributes: + label: Describe the expected behaviour + validations: + required: false + - type: textarea + id: what-happened_model + attributes: + label: Standalone code/steps you may have used to try to get what you need + description: If there is a problem, provide a reproducible test case that is the bare minimum necessary to generate the problem. If possible, please share a link to Colab, GitHub repo link or anything that we can use to reproduce the problem + render: shell + validations: + required: false + - type: textarea + id: other_info + attributes: + label: Other info / Complete Logs + description: Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached + render: shell + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/19-other-issues.md b/.github/ISSUE_TEMPLATE/19-other-issues.md new file mode 100644 index 0000000..c590f3f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/19-other-issues.md @@ -0,0 +1,12 @@ +--- +name: "Other Issue" +about: Use this template for any other non-support related issues. +labels: type:others + +--- +This template is for miscellaneous issues not covered by the other issue categories + +For questions on how to work with MediaPipe, or support for problems that are not verified bugs in MediaPipe, please go to [StackOverflow](https://stackoverflow.com/questions/tagged/mediapipe) and [Slack](https://mediapipe.page.link/joinslack) communities. + +If you are reporting a vulnerability, please use the [dedicated reporting process](https://github.com/google/mediapipe/security). 
+ diff --git a/.github/bot_config.yml b/.github/bot_config.yml new file mode 100644 index 0000000..74a60e4 --- /dev/null +++ b/.github/bot_config.yml @@ -0,0 +1,19 @@ +# Copyright 2021 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# A list of assignees +assignees: + - kuaashish + - ayushgdev diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml new file mode 100644 index 0000000..c78273a --- /dev/null +++ b/.github/workflows/stale.yaml @@ -0,0 +1,68 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# This workflow alerts and then closes the stale issues/PRs after specific time +# You can adjust the behavior by modifying this file. 
+# For more information, see: +# https://github.com/actions/stale + +name: 'Close stale issues and PRs' +"on": + schedule: + - cron: "30 1 * * *" +permissions: + contents: read + issues: write + pull-requests: write +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: 'actions/stale@v7' + with: + # Comma-separated list of labels that can be assigned to issues to exclude them from being marked as stale. + exempt-issue-labels: 'override-stale' + # Comma-separated list of labels that can be assigned to PRs to exclude them from being marked as stale. + exempt-pr-labels: "override-stale" + # Limit the number of API operations per run (the default value is 30). + operations-per-run: 500 + # Remove the stale label when PRs or issues are updated. + remove-stale-when-updated: true + # List of labels to remove when issues/PRs become unstale. + labels-to-remove-when-unstale: 'stat:awaiting response' + # Comment posted on an issue that has not been active for more than 7 days. + stale-issue-message: 'This issue has been marked stale because it has no recent activity since 7 days. It will be closed if no further activity occurs. Thank you.' + # Comment posted on a PR that has not been active for more than 14 days. + stale-pr-message: 'This PR has been marked stale because it has no recent activity since 14 days. It will be closed if no further activity occurs. Thank you.' + # Comment posted on an issue that has been stale for more than 7 days. + close-issue-message: This issue was closed due to lack of activity after being marked stale for past 7 days. + # Comment posted on a PR that has been stale for more than 14 days. + close-pr-message: This PR was closed due to lack of activity after being marked stale for past 14 days. 
+ # Number of days of inactivity before an issue becomes stale + days-before-issue-stale: 7 + # Number of days of inactivity before a stale issue is closed + days-before-issue-close: 7 + # Reason recorded when closing the issue (the default value is not_planned) + close-issue-reason: completed + # Number of days of inactivity before a stale PR is closed + days-before-pr-close: 14 + # Number of days of inactivity before a PR becomes stale + days-before-pr-stale: 14 + # Only issues/PRs carrying at least one of these labels are marked stale or closed + any-of-labels: 'stat:awaiting response' + # Label applied to stale PRs + stale-pr-label: 'stale' + # Label applied to stale issues + stale-issue-label: "stale" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..525f087 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +bazel-* +mediapipe/MediaPipe.xcodeproj +mediapipe/MediaPipe.tulsiproj/*.tulsiconf-user +mediapipe/provisioning_profile.mobileprovision +node_modules/ +.configure.bazelrc +.user.bazelrc diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..262642b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,75 @@ +{ + "C_Cpp.errorSquiggles": "disabled", + "files.associations": { + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + 
"numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "semaphore": "cpp", + "span": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "variant": "cpp", + "any": "cpp", + "fstream": "cpp", + "*.inc": "cpp" + } +} \ No newline at end of file diff --git a/BUILD.bazel b/BUILD.bazel new file mode 100644 index 0000000..e3443b8 --- /dev/null +++ b/BUILD.bazel @@ -0,0 +1,22 @@ +# Copyright 2022 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +licenses(["notice"]) + +exports_files([ + "LICENSE", + "tsconfig.json", + "package.json", + "yarn.lock", +]) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..df7772a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,36 @@ +# Contributing guidelines + +## What type of pull request do we accept into MediaPipe repository? + +* Bug fixes +* Documentation fixes + +For new feature additions (e.g., new graphs and calculators), we are currently not planning to accept new feature pull requests into the MediaPipe repository. 
Instead, we would like contributors to create their own repositories for the new feature and list them at [Awesome MediaPipe](https://mediapipe.page.link/awesome-mediapipe). This will allow contributors to more quickly get their code out to the community. + +Before sending your pull requests, make sure you have followed this checklist. + +- Read [contributing guidelines](CONTRIBUTING.md). +- Read [Code of Conduct](CODE_OF_CONDUCT.md). +- Ensure you have signed the [Contributor License Agreement (CLA)](https://cla.developers.google.com/). + +## How to become a contributor and submit your own code + +### Contributor License Agreements + +We'd love to accept your patches! Before we can take them, we have to jump a couple of legal hurdles. + +Please fill out either the individual or corporate Contributor License Agreement (CLA). + + * If you are an individual writing original source code and you're sure you own the intellectual property, then you'll need to sign an [individual CLA](https://code.google.com/legal/individual-cla-v1.0.html). + * If you work for a company that wants to allow you to contribute your work, then you'll need to sign a [corporate CLA](https://code.google.com/legal/corporate-cla-v1.0.html). + +Follow either of the two links above to access the appropriate CLA and instructions for how to sign and return it. Once we receive it, we'll be able to accept your pull requests. + +***NOTE***: Only original source code from you and other people that have signed the CLA can be accepted into the main repository. + +### Contributing code + +If you have bug fixes or documentation fixes for MediaPipe, send us your pull requests! For those +just getting started, GitHub has a [howto](https://help.github.com/articles/using-pull-requests/). + +MediaPipe team members will be assigned to review your pull requests.
Once the bug/documentation fixes are verified, a MediaPipe team member will acknowledge your contribution in the pull request comments, manually merge the fixes into our internal codebase upstream, and apply the `to be closed` label to the pull request. These fixes will later be pushed to GitHub in the next release, and a MediaPipe team member will then close the pull request. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0e03e39 --- /dev/null +++ b/LICENSE @@ -0,0 +1,218 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +=========================================================================== +For files under tasks/cc/text/language_detector/custom_ops/utils/utf/ +=========================================================================== +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 
+ */ diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..37900d0 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,10 @@ +global-exclude .git* +global-exclude *_test.py + +include CONTRIBUTING.md +include LICENSE +include MANIFEST.in +include README.md +include requirements.txt + +recursive-include mediapipe/modules *.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..e4f5dd1 --- /dev/null +++ b/README.md @@ -0,0 +1,158 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe +title: Home +nav_order: 1 +--- + +---- + +**Attention:** *We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +![MediaPipe](https://developers.google.com/static/mediapipe/images/home/hero_01_1920.png) + +**Attention**: MediaPipe Solutions Preview is an early release. [Learn +more](https://developers.google.com/mediapipe/solutions/about#notice). + +**On-device machine learning for everyone** + +Delight your customers with innovative machine learning features. MediaPipe +contains everything that you need to customize and deploy to mobile (Android, +iOS), web, desktop, edge devices, and IoT, effortlessly. + +* [See demos](https://goo.gle/mediapipe-studio) +* [Learn more](https://developers.google.com/mediapipe/solutions) + +## Get started + +You can get started with MediaPipe Solutions by checking out any of the +developer guides for +[vision](https://developers.google.com/mediapipe/solutions/vision/object_detector), +[text](https://developers.google.com/mediapipe/solutions/text/text_classifier), +and +[audio](https://developers.google.com/mediapipe/solutions/audio/audio_classifier) +tasks.
If you need help setting up a development environment for use with +MediaPipe Tasks, check out the setup guides for +[Android](https://developers.google.com/mediapipe/solutions/setup_android), [web +apps](https://developers.google.com/mediapipe/solutions/setup_web), and +[Python](https://developers.google.com/mediapipe/solutions/setup_python). + +## Solutions + +MediaPipe Solutions provides a suite of libraries and tools for you to quickly +apply artificial intelligence (AI) and machine learning (ML) techniques in your +applications. You can plug these solutions into your applications immediately, +customize them to your needs, and use them across multiple development +platforms. MediaPipe Solutions is part of the MediaPipe [open source +project](https://github.com/google/mediapipe), so you can further customize the +solutions code to meet your application needs. + +These libraries and resources provide the core functionality for each MediaPipe +Solution: + +* **MediaPipe Tasks**: Cross-platform APIs and libraries for deploying + solutions. [Learn + more](https://developers.google.com/mediapipe/solutions/tasks). +* **MediaPipe models**: Pre-trained, ready-to-run models for use with each + solution. + +These tools let you customize and evaluate solutions: + +* **MediaPipe Model Maker**: Customize models for solutions with your data. + [Learn more](https://developers.google.com/mediapipe/solutions/model_maker). +* **MediaPipe Studio**: Visualize, evaluate, and benchmark solutions in your + browser. [Learn + more](https://developers.google.com/mediapipe/solutions/studio). + +### Legacy solutions + +We have ended support for [these MediaPipe Legacy Solutions](https://developers.google.com/mediapipe/solutions/guide#legacy) +as of March 1, 2023. All other MediaPipe Legacy Solutions will be upgraded to +a new MediaPipe Solution. See the [Solutions guide](https://developers.google.com/mediapipe/solutions/guide#legacy) +for details. 
The [code repository](https://github.com/google/mediapipe/tree/master/mediapipe) +and prebuilt binaries for all MediaPipe Legacy Solutions will continue to be +provided on an as-is basis. + +For more on the legacy solutions, see the [documentation](https://github.com/google/mediapipe/tree/master/docs/solutions). + +## Framework + +To start using MediaPipe Framework, [install MediaPipe +Framework](https://developers.google.com/mediapipe/framework/getting_started/install) +and start building example applications in C++, Android, and iOS. + +[MediaPipe Framework](https://developers.google.com/mediapipe/framework) is the +low-level component used to build efficient on-device machine learning +pipelines, similar to the premade MediaPipe Solutions. + +Before using MediaPipe Framework, familiarize yourself with the following key +[Framework +concepts](https://developers.google.com/mediapipe/framework/framework_concepts/overview.md): + +* [Packets](https://developers.google.com/mediapipe/framework/framework_concepts/packets.md) +* [Graphs](https://developers.google.com/mediapipe/framework/framework_concepts/graphs.md) +* [Calculators](https://developers.google.com/mediapipe/framework/framework_concepts/calculators.md) + +## Community + +* [Slack community](https://mediapipe.page.link/joinslack) for MediaPipe + users. +* [Discuss](https://groups.google.com/forum/#!forum/mediapipe) - General + community discussion around MediaPipe. +* [Awesome MediaPipe](https://mediapipe.page.link/awesome-mediapipe) - A + curated list of awesome MediaPipe related frameworks, libraries and + software. + +## Contributing + +We welcome contributions. Please follow these +[guidelines](https://github.com/google/mediapipe/blob/master/CONTRIBUTING.md). + +We use GitHub issues for tracking requests and bugs. Please post questions to +the MediaPipe Stack Overflow with a `mediapipe` tag. 
+ +## Resources + +### Publications + +* [Bringing artworks to life with AR](https://developers.googleblog.com/2021/07/bringing-artworks-to-life-with-ar.html) + in Google Developers Blog +* [Prosthesis control via Mirru App using MediaPipe hand tracking](https://developers.googleblog.com/2021/05/control-your-mirru-prosthesis-with-mediapipe-hand-tracking.html) + in Google Developers Blog +* [SignAll SDK: Sign language interface using MediaPipe is now available for + developers](https://developers.googleblog.com/2021/04/signall-sdk-sign-language-interface-using-mediapipe-now-available.html) + in Google Developers Blog +* [MediaPipe Holistic - Simultaneous Face, Hand and Pose Prediction, on + Device](https://ai.googleblog.com/2020/12/mediapipe-holistic-simultaneous-face.html) + in Google AI Blog +* [Background Features in Google Meet, Powered by Web ML](https://ai.googleblog.com/2020/10/background-features-in-google-meet.html) + in Google AI Blog +* [MediaPipe 3D Face Transform](https://developers.googleblog.com/2020/09/mediapipe-3d-face-transform.html) + in Google Developers Blog +* [Instant Motion Tracking With MediaPipe](https://developers.googleblog.com/2020/08/instant-motion-tracking-with-mediapipe.html) + in Google Developers Blog +* [BlazePose - On-device Real-time Body Pose Tracking](https://ai.googleblog.com/2020/08/on-device-real-time-body-pose-tracking.html) + in Google AI Blog +* [MediaPipe Iris: Real-time Eye Tracking and Depth Estimation](https://ai.googleblog.com/2020/08/mediapipe-iris-real-time-iris-tracking.html) + in Google AI Blog +* [MediaPipe KNIFT: Template-based feature matching](https://developers.googleblog.com/2020/04/mediapipe-knift-template-based-feature-matching.html) + in Google Developers Blog +* [Alfred Camera: Smart camera features using MediaPipe](https://developers.googleblog.com/2020/03/alfred-camera-smart-camera-features-using-mediapipe.html) + in Google Developers Blog +* [Real-Time 3D Object Detection on Mobile Devices with 
MediaPipe](https://ai.googleblog.com/2020/03/real-time-3d-object-detection-on-mobile.html) + in Google AI Blog +* [AutoFlip: An Open Source Framework for Intelligent Video Reframing](https://ai.googleblog.com/2020/02/autoflip-open-source-framework-for.html) + in Google AI Blog +* [MediaPipe on the Web](https://developers.googleblog.com/2020/01/mediapipe-on-web.html) + in Google Developers Blog +* [Object Detection and Tracking using MediaPipe](https://developers.googleblog.com/2019/12/object-detection-and-tracking-using-mediapipe.html) + in Google Developers Blog +* [On-Device, Real-Time Hand Tracking with MediaPipe](https://ai.googleblog.com/2019/08/on-device-real-time-hand-tracking-with.html) + in Google AI Blog +* [MediaPipe: A Framework for Building Perception Pipelines](https://arxiv.org/abs/1906.08172) + +### Videos + +* [YouTube Channel](https://www.youtube.com/c/MediaPipe) diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 0000000..fd13e9b --- /dev/null +++ b/WORKSPACE @@ -0,0 +1,682 @@ +workspace(name = "mediapipe") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +# Protobuf expects an //external:python_headers target +bind( + name = "python_headers", + actual = "@local_config_python//:python_headers", +) + +http_archive( + name = "bazel_skylib", + sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz", + "https://github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz", + ], +) +load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace") +bazel_skylib_workspace() +load("@bazel_skylib//lib:versions.bzl", "versions") +versions.check(minimum_bazel_version = "3.7.2") + +# ABSL on 2023-10-18 +http_archive( + name = "com_google_absl", + urls = 
["https://github.com/abseil/abseil-cpp/archive//9687a8ea750bfcddf790372093245a1d041b21a3.tar.gz"], + patches = [ + "@//third_party:com_google_absl_windows_patch.diff" + ], + patch_args = [ + "-p1", + ], + strip_prefix = "abseil-cpp-9687a8ea750bfcddf790372093245a1d041b21a3", + sha256 = "f841f78243f179326f2a80b719f2887c38fe226d288ecdc46e2aa091e6aa43bc", +) + +http_archive( + name = "rules_cc", + strip_prefix = "rules_cc-2f8c04c04462ab83c545ab14c0da68c3b4c96191", +# The commit can be updated if the build passes. Last updated 6/23/22. + urls = ["https://github.com/bazelbuild/rules_cc/archive/2f8c04c04462ab83c545ab14c0da68c3b4c96191.zip"], +) + +http_archive( + name = "rules_foreign_cc", + sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51", + strip_prefix = "rules_foreign_cc-0.9.0", + url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz", +) + +load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies") + +rules_foreign_cc_dependencies() + +http_archive( + name = "rules_java", + sha256 = "c73336802d0b4882e40770666ad055212df4ea62cfa6edf9cb0f9d29828a0934", + url = "https://github.com/bazelbuild/rules_java/releases/download/5.3.5/rules_java-5.3.5.tar.gz", +) + +http_archive( + name = "com_google_protobuf", + sha256 = "87407cd28e7a9c95d9f61a098a53cf031109d451a7763e7dd1253abf8b4df422", + strip_prefix = "protobuf-3.19.1", + urls = ["https://github.com/protocolbuffers/protobuf/archive/v3.19.1.tar.gz"], + patches = [ + "@//third_party:com_google_protobuf_fixes.diff" + ], + patch_args = [ + "-p1", + ], +) + +http_archive( + name = "cpuinfo", + sha256 = "a615cac78fad03952cc3e1fd231ce789a8df6e81a5957b64350cb8200364b385", + strip_prefix = "cpuinfo-d6860c477c99f1fce9e28eb206891af3c0e1a1d7", + urls = [ + "https://github.com/pytorch/cpuinfo/archive/d6860c477c99f1fce9e28eb206891af3c0e1a1d7.zip" + ], +) + +# XNNPACK on 2024-03-27. 
+http_archive( + name = "XNNPACK", + # `curl -L | shasum -a 256` + sha256 = "179a680ef85deb5380b850f2551b214e00835c232f5b197dedf7c011a6adf5a6", + strip_prefix = "XNNPACK-2fe25b859581a34e77b48b06c640ac1a5a58612e", + url = "https://github.com/google/XNNPACK/archive/2fe25b859581a34e77b48b06c640ac1a5a58612e.zip", +) + +# TODO: This is an indirect dependency. We should factor it out. +http_archive( + name = "pthreadpool", + sha256 = "a4cf06de57bfdf8d7b537c61f1c3071bce74e57524fe053e0bbd2332feca7f95", + strip_prefix = "pthreadpool-4fe0e1e183925bf8cfa6aae24237e724a96479b8", + urls = ["https://github.com/Maratyszcza/pthreadpool/archive/4fe0e1e183925bf8cfa6aae24237e724a96479b8.zip"], +) + +# Load Zlib before initializing TensorFlow and the iOS build rules to guarantee +# that the target @zlib//:mini_zlib is available +http_archive( + name = "zlib", + build_file = "@//third_party:zlib.BUILD", + sha256 = "b3a24de97a8fdbc835b9833169501030b8977031bcb54b3b3ac13740f846ab30", + strip_prefix = "zlib-1.2.13", + url = "http://zlib.net/fossils/zlib-1.2.13.tar.gz", + patches = [ + "@//third_party:zlib.diff", + ], + patch_args = [ + "-p1", + ], +) + +# iOS basic build deps. +http_archive( + name = "build_bazel_rules_apple", + sha256 = "3e2c7ae0ddd181c4053b6491dad1d01ae29011bc322ca87eea45957c76d3a0c3", + url = "https://github.com/bazelbuild/rules_apple/releases/download/2.1.0/rules_apple.2.1.0.tar.gz", + patches = [ + # Bypass checking ios unit test runner when building MP ios applications.
+ "@//third_party:build_bazel_rules_apple_bypass_test_runner_check.diff" + ], + patch_args = [ + "-p1", + ], +) + +load( + "@build_bazel_rules_apple//apple:repositories.bzl", + "apple_rules_dependencies", +) +apple_rules_dependencies() + +load( + "@build_bazel_rules_swift//swift:repositories.bzl", + "swift_rules_dependencies", +) +swift_rules_dependencies() + +load( + "@build_bazel_rules_swift//swift:extras.bzl", + "swift_rules_extra_dependencies", +) +swift_rules_extra_dependencies() + +load( + "@build_bazel_apple_support//lib:repositories.bzl", + "apple_support_dependencies", +) +apple_support_dependencies() + +# This is used to select all contents of the archives for CMake-based packages to give CMake access to them. +all_content = """filegroup(name = "all", srcs = glob(["**"]), visibility = ["//visibility:public"])""" + +# GoogleTest/GoogleMock framework. Used by most unit-tests. +# Last updated 2021-07-02. +http_archive( + name = "com_google_googletest", + urls = ["https://github.com/google/googletest/archive/4ec4cd23f486bf70efcc5d2caa40f24368f752e3.zip"], + strip_prefix = "googletest-4ec4cd23f486bf70efcc5d2caa40f24368f752e3", + sha256 = "de682ea824bfffba05b4e33b67431c247397d6175962534305136aa06f92e049", +) + +# Google Benchmark library v1.6.1 released on 2022-01-10. 
+http_archive( + name = "com_google_benchmark", + urls = ["https://github.com/google/benchmark/archive/refs/tags/v1.6.1.tar.gz"], + strip_prefix = "benchmark-1.6.1", + sha256 = "6132883bc8c9b0df5375b16ab520fac1a85dc9e4cf5be59480448ece74b278d4", + build_file = "@//third_party:benchmark.BUILD", +) + +# gflags needed by glog +http_archive( + name = "com_github_gflags_gflags", + strip_prefix = "gflags-2.2.2", + sha256 = "19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5", + url = "https://github.com/gflags/gflags/archive/v2.2.2.zip", +) + +# 2020-08-21 +http_archive( + name = "com_github_glog_glog", + strip_prefix = "glog-0.6.0", + sha256 = "8a83bf982f37bb70825df71a9709fa90ea9f4447fb3c099e1d720a439d88bad6", + urls = [ + "https://github.com/google/glog/archive/v0.6.0.tar.gz", + ], +) +http_archive( + name = "com_github_glog_glog_no_gflags", + strip_prefix = "glog-0.6.0", + sha256 = "8a83bf982f37bb70825df71a9709fa90ea9f4447fb3c099e1d720a439d88bad6", + build_file = "@//third_party:glog_no_gflags.BUILD", + urls = [ + "https://github.com/google/glog/archive/v0.6.0.tar.gz", + ], + patches = [ + "@//third_party:com_github_glog_glog.diff", + ], + patch_args = [ + "-p1", + ], +) + +# 2023-06-05 +# This version of Glog is required for Windows support, but currently causes +# crashes on some Android devices. 
+http_archive( + name = "com_github_glog_glog_windows", + strip_prefix = "glog-3a0d4d22c5ae0b9a2216988411cfa6bf860cc372", + sha256 = "170d08f80210b82d95563f4723a15095eff1aad1863000e8eeb569c96a98fefb", + urls = [ + "https://github.com/google/glog/archive/3a0d4d22c5ae0b9a2216988411cfa6bf860cc372.zip", + ], + patches = [ + "@//third_party:com_github_glog_glog.diff", + "@//third_party:com_github_glog_glog_windows_patch.diff", + ], + patch_args = [ + "-p1", + ], +) + +# easyexif +http_archive( + name = "easyexif", + url = "https://github.com/mayanklahiri/easyexif/archive/master.zip", + strip_prefix = "easyexif-master", + build_file = "@//third_party:easyexif.BUILD", +) + +# libyuv +http_archive( + name = "libyuv", + # Error: operand type mismatch for `vbroadcastss' caused by commit 8a13626e42f7fdcf3a6acbb0316760ee54cda7d8. + urls = ["https://chromium.googlesource.com/libyuv/libyuv/+archive/2525698acba9bf9b701ba6b4d9584291a1f62257.tar.gz"], + build_file = "@//third_party:libyuv.BUILD", +) + +# Note: protobuf-javalite is no longer released as a separate download, it's included in the main Java download. +# ...but the Java download is currently broken, so we use the "source" download. 
+http_archive( + name = "com_google_protobuf_javalite", + sha256 = "87407cd28e7a9c95d9f61a098a53cf031109d451a7763e7dd1253abf8b4df422", + strip_prefix = "protobuf-3.19.1", + urls = ["https://github.com/protocolbuffers/protobuf/archive/v3.19.1.tar.gz"], +) + +load("@//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo") +flatbuffers() + +http_archive( + name = "com_google_audio_tools", + strip_prefix = "multichannel-audio-tools-1f6b1319f13282eda6ff1317be13de67f4723860", + urls = ["https://github.com/google/multichannel-audio-tools/archive/1f6b1319f13282eda6ff1317be13de67f4723860.zip"], + sha256 = "fe346e1aee4f5069c4cbccb88706a9a2b2b4cf98aeb91ec1319be77e07dd7435", + repo_mapping = {"@com_github_glog_glog" : "@com_github_glog_glog_no_gflags"}, + # TODO: Fix this in AudioTools directly + patches = ["@//third_party:com_google_audio_tools_fixes.diff"], + patch_args = ["-p1"] +) + +http_archive( + name = "pffft", + strip_prefix = "jpommier-pffft-7c3b5a7dc510", + urls = ["https://bitbucket.org/jpommier/pffft/get/7c3b5a7dc510.zip"], + build_file = "@//third_party:pffft.BUILD", +) + +# Sentencepiece +http_archive( + name = "com_google_sentencepiece", + strip_prefix = "sentencepiece-0.1.96", + add_prefix = "sentencepiece", + sha256 = "8409b0126ebd62b256c685d5757150cf7fcb2b92a2f2b98efb3f38fc36719754", + urls = [ + "https://github.com/google/sentencepiece/archive/refs/tags/v0.1.96.zip" + ], + build_file = "@//third_party:sentencepiece.BUILD", + patches = ["@//third_party:com_google_sentencepiece.diff"], + patch_args = ["-d", "sentencepiece", "-p1"], +) + +http_archive( + name = "darts_clone", + build_file = "@//third_party:darts_clone.BUILD", + sha256 = "c97f55d05c98da6fcaf7f9ecc6a6dc6bc5b18b8564465f77abff8879d446491c", + strip_prefix = "darts-clone-e40ce4627526985a7767444b6ed6893ab6ff8983", + urls = [ + "https://github.com/s-yata/darts-clone/archive/e40ce4627526985a7767444b6ed6893ab6ff8983.zip", + ], +) + +http_archive( + name = "org_tensorflow_text", + sha256 = 
"f64647276f7288d1b1fe4c89581d51404d0ce4ae97f2bcc4c19bd667549adca8", + strip_prefix = "text-2.2.0", + urls = [ + "https://github.com/tensorflow/text/archive/v2.2.0.zip", + ], + patches = [ + "@//third_party:tensorflow_text_remove_tf_deps.diff", + "@//third_party:tensorflow_text_a0f49e63.diff", + ], + patch_args = ["-p1"], + repo_mapping = {"@com_google_re2": "@com_googlesource_code_re2"}, +) + +http_archive( + name = "com_googlesource_code_re2", + sha256 = "ef516fb84824a597c4d5d0d6d330daedb18363b5a99eda87d027e6bdd9cba299", + strip_prefix = "re2-03da4fc0857c285e3a26782f6bc8931c4c950df4", + urls = [ + "https://github.com/google/re2/archive/03da4fc0857c285e3a26782f6bc8931c4c950df4.tar.gz", + ], +) + +# 2020-07-09 +http_archive( + name = "pybind11_bazel", + strip_prefix = "pybind11_bazel-203508e14aab7309892a1c5f7dd05debda22d9a5", + urls = ["https://github.com/pybind/pybind11_bazel/archive/203508e14aab7309892a1c5f7dd05debda22d9a5.zip"], + sha256 = "75922da3a1bdb417d820398eb03d4e9bd067c4905a4246d35a44c01d62154d91", +) + +# 2022-10-20 +http_archive( + name = "pybind11", + urls = [ + "https://github.com/pybind/pybind11/archive/v2.10.1.zip", + ], + sha256 = "fcf94065efcfd0a7a828bacf118fa11c43f6390d0c805e3e6342ac119f2e9976", + strip_prefix = "pybind11-2.10.1", + build_file = "@pybind11_bazel//:pybind11.BUILD", +) + +http_archive( + name = "pybind11_protobuf", + sha256 = "baa1f53568283630a5055c85f0898b8810f7a6431bd01bbaedd32b4c1defbcb1", + strip_prefix = "pybind11_protobuf-3594106f2df3d725e65015ffb4c7886d6eeee683", + urls = [ + "https://github.com/pybind/pybind11_protobuf/archive/3594106f2df3d725e65015ffb4c7886d6eeee683.tar.gz", + ], +) + +# Point to the commit that deprecates the usage of Eigen::MappedSparseMatrix. 
+http_archive( + name = "ceres_solver", + url = "https://github.com/ceres-solver/ceres-solver/archive/123fba61cf2611a3c8bddc9d91416db26b10b558.zip", + patches = [ + "@//third_party:ceres_solver_compatibility_fixes.diff" + ], + patch_args = [ + "-p1", + ], + strip_prefix = "ceres-solver-123fba61cf2611a3c8bddc9d91416db26b10b558", + sha256 = "8b7b16ceb363420e0fd499576daf73fa338adb0b1449f58bea7862766baa1ac7" +) + +http_archive( + name = "opencv", + build_file_content = all_content, + strip_prefix = "opencv-3.4.10", + urls = ["https://github.com/opencv/opencv/archive/3.4.10.tar.gz"], +) + +new_local_repository( + name = "linux_opencv", + build_file = "@//third_party:opencv_linux.BUILD", + path = "/usr", +) + +new_local_repository( + name = "linux_ffmpeg", + build_file = "@//third_party:ffmpeg_linux.BUILD", + path = "/usr" +) + +new_local_repository( + name = "macos_opencv", + build_file = "@//third_party:opencv_macos.BUILD", + # For local MacOS builds, the path should point to an opencv@3 installation. + # If you edit the path here, you will also need to update the corresponding + # prefix in "opencv_macos.BUILD". + path = "/usr/local", # e.g. /usr/local/Cellar for HomeBrew +) + +new_local_repository( + name = "macos_ffmpeg", + build_file = "@//third_party:ffmpeg_macos.BUILD", + path = "/usr/local/opt/ffmpeg", +) + +new_local_repository( + name = "windows_opencv", + build_file = "@//third_party:opencv_windows.BUILD", + path = "C:\\opencv\\build", +) + +http_archive( + name = "android_opencv", + build_file = "@//third_party:opencv_android.BUILD", + strip_prefix = "OpenCV-android-sdk", + type = "zip", + url = "https://github.com/opencv/opencv/releases/download/3.4.3/opencv-3.4.3-android-sdk.zip", +) + +# After OpenCV 3.2.0, the pre-compiled opencv2.framework has google protobuf symbols, which will +# trigger duplicate symbol errors in the linking stage of building a mediapipe ios app. 
+# To get a higher version of OpenCV for iOS, opencv2.framework needs to be built from source with +# '-DBUILD_PROTOBUF=OFF -DBUILD_opencv_dnn=OFF'. +http_archive( + name = "ios_opencv", + sha256 = "7dd536d06f59e6e1156b546bd581523d8df92ce83440002885ec5abc06558de2", + build_file = "@//third_party:opencv_ios.BUILD", + type = "zip", + url = "https://github.com/opencv/opencv/releases/download/3.2.0/opencv-3.2.0-ios-framework.zip", +) + +# Building an opencv.xcframework from the OpenCV 4.5.3 sources is necessary for +# MediaPipe iOS Task Libraries to be supported on arm64(M1) Macs. An +# `opencv.xcframework` archive has not been released and it is recommended to +# build the same from source using a script provided in OpenCV 4.5.0 upwards. +# OpenCV is pinned to version 4.5.3 since swift support can only be disabled +# from 4.5.3 upwards. This is needed to avoid errors when the library is linked +# in Xcode. Swift support will be added in when the final binary MediaPipe iOS +# Task libraries are built. +http_archive( + name = "ios_opencv_source", + sha256 = "a61e7a4618d353140c857f25843f39b2abe5f451b018aab1604ef0bc34cd23d5", + build_file = "@//third_party:opencv_ios_source.BUILD", + type = "zip", + url = "https://github.com/opencv/opencv/archive/refs/tags/4.5.3.zip", +) + +http_archive( + name = "stblib", + strip_prefix = "stb-b42009b3b9d4ca35bc703f5310eedc74f584be58", + sha256 = "13a99ad430e930907f5611325ec384168a958bf7610e63e60e2fd8e7b7379610", + urls = ["https://github.com/nothings/stb/archive/b42009b3b9d4ca35bc703f5310eedc74f584be58.tar.gz"], + build_file = "@//third_party:stblib.BUILD", + patches = [ + "@//third_party:stb_image_impl.diff" + ], + patch_args = [ + "-p1", + ], +) + +# More iOS deps. 
+ +http_archive( + name = "google_toolbox_for_mac", + url = "https://github.com/google/google-toolbox-for-mac/archive/v2.2.1.zip", + sha256 = "e3ac053813c989a88703556df4dc4466e424e30d32108433ed6beaec76ba4fdc", + strip_prefix = "google-toolbox-for-mac-2.2.1", + build_file = "@//third_party:google_toolbox_for_mac.BUILD", +) + +# Maven dependencies. + +RULES_JVM_EXTERNAL_TAG = "4.0" +RULES_JVM_EXTERNAL_SHA = "31701ad93dbfe544d597dbe62c9a1fdd76d81d8a9150c2bf1ecf928ecdf97169" + +http_archive( + name = "rules_jvm_external", + strip_prefix = "rules_jvm_external-%s" % RULES_JVM_EXTERNAL_TAG, + sha256 = RULES_JVM_EXTERNAL_SHA, + url = "https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG, +) + +load("@rules_jvm_external//:defs.bzl", "maven_install") + +# Important: there can only be one maven_install rule. Add new maven deps here. +maven_install( + artifacts = [ + "androidx.concurrent:concurrent-futures:1.0.0-alpha03", + "androidx.lifecycle:lifecycle-common:2.3.1", + "androidx.activity:activity:1.2.2", + "androidx.exifinterface:exifinterface:1.3.3", + "androidx.fragment:fragment:1.3.4", + "androidx.annotation:annotation:aar:1.1.0", + "androidx.appcompat:appcompat:aar:1.1.0-rc01", + "androidx.camera:camera-core:1.0.0-beta10", + "androidx.camera:camera-camera2:1.0.0-beta10", + "androidx.camera:camera-lifecycle:1.0.0-beta10", + "androidx.constraintlayout:constraintlayout:aar:1.1.3", + "androidx.core:core:aar:1.1.0-rc03", + "androidx.legacy:legacy-support-v4:aar:1.0.0", + "androidx.recyclerview:recyclerview:aar:1.1.0-beta02", + "androidx.test.espresso:espresso-core:3.1.1", + "com.github.bumptech.glide:glide:4.11.0", + "com.google.android.material:material:aar:1.0.0-rc01", + "com.google.auto.value:auto-value:1.8.1", + "com.google.auto.value:auto-value-annotations:1.8.1", + "com.google.code.findbugs:jsr305:latest.release", + "com.google.android.datatransport:transport-api:3.0.0", + 
"com.google.android.datatransport:transport-backend-cct:3.1.0", + "com.google.android.datatransport:transport-runtime:3.1.0", + "com.google.flogger:flogger-system-backend:0.6", + "com.google.flogger:flogger:0.6", + "com.google.guava:guava:27.0.1-android", + "com.google.guava:listenablefuture:1.0", + "junit:junit:4.12", + "org.hamcrest:hamcrest-library:1.3", + ], + repositories = [ + "https://maven.google.com", + "https://dl.google.com/dl/android/maven2", + "https://repo1.maven.org/maven2", + "https://jcenter.bintray.com", + ], + fetch_sources = True, + version_conflict_policy = "pinned", +) + +# Needed by TensorFlow +http_archive( + name = "io_bazel_rules_closure", + sha256 = "e0a111000aeed2051f29fcc7a3f83be3ad8c6c93c186e64beb1ad313f0c7f9f9", + strip_prefix = "rules_closure-cf1e44edb908e9616030cc83d085989b8e6cd6df", + urls = [ + "http://mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/cf1e44edb908e9616030cc83d085989b8e6cd6df.tar.gz", + "https://github.com/bazelbuild/rules_closure/archive/cf1e44edb908e9616030cc83d085989b8e6cd6df.tar.gz", # 2019-04-04 + ], +) + +# TensorFlow repo should always go after the other external dependencies. +# TF on 2024-05-09. +_TENSORFLOW_GIT_COMMIT = "8038e44ea38bb889095afaaf6ad05e94adaed8d2" +# curl -L https://github.com/tensorflow/tensorflow/archive/8038e44ea38bb889095afaaf6ad05e94adaed8d2.tar.gz | shasum -a 256 +_TENSORFLOW_SHA256 = "a00c1503a879eb21c349941bbee54aef8d557d7d2ab770e76fb26668d75aa6e0" +http_archive( + name = "org_tensorflow", + urls = [ + "https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % _TENSORFLOW_GIT_COMMIT, + ], + patches = [ + "@//third_party:org_tensorflow_system_python.diff", + # Diff is generated with a script, don't update it manually. + "@//third_party:org_tensorflow_custom_ops.diff", + # Works around Bazel issue with objc_library. 
+ # See https://github.com/bazelbuild/bazel/issues/19912 + "@//third_party:org_tensorflow_objc_build_fixes.diff", + # Restores scores for text pipelines, which return different results + # with subgraph reshaping + "@//third_party:org_tensorflow_disable_subgraph_reshaping.diff" + ], + patch_args = [ + "-p1", + ], + strip_prefix = "tensorflow-%s" % _TENSORFLOW_GIT_COMMIT, + sha256 = _TENSORFLOW_SHA256, +) + +load("@org_tensorflow//tensorflow:workspace3.bzl", "tf_workspace3") +tf_workspace3() +load("@org_tensorflow//tensorflow:workspace2.bzl", "tf_workspace2") +tf_workspace2() + +# Edge TPU +http_archive( + name = "libedgetpu", + sha256 = "14d5527a943a25bc648c28a9961f954f70ba4d79c0a9ca5ae226e1831d72fe80", + strip_prefix = "libedgetpu-3164995622300286ef2bb14d7fdc2792dae045b7", + urls = [ + "https://github.com/google-coral/libedgetpu/archive/3164995622300286ef2bb14d7fdc2792dae045b7.tar.gz" + ], +) +load("@libedgetpu//:workspace.bzl", "libedgetpu_dependencies") +libedgetpu_dependencies() + +load("@coral_crosstool//:configure.bzl", "cc_crosstool") +cc_crosstool(name = "crosstool") + + +# Node dependencies +http_archive( + name = "build_bazel_rules_nodejs", + sha256 = "94070eff79305be05b7699207fbac5d2608054dd53e6109f7d00d923919ff45a", + urls = ["https://github.com/bazelbuild/rules_nodejs/releases/download/5.8.2/rules_nodejs-5.8.2.tar.gz"], +) + +load("@build_bazel_rules_nodejs//:repositories.bzl", "build_bazel_rules_nodejs_dependencies") +build_bazel_rules_nodejs_dependencies() + +# fetches nodejs, npm, and yarn +load("@build_bazel_rules_nodejs//:index.bzl", "node_repositories", "yarn_install") +node_repositories() +yarn_install( + name = "npm", + package_json = "@//:package.json", + yarn_lock = "@//:yarn.lock", +) + +# Protobuf for Node dependencies +http_archive( + name = "rules_proto_grpc", + sha256 = "bbe4db93499f5c9414926e46f9e35016999a4e9f6e3522482d3760dc61011070", + strip_prefix = "rules_proto_grpc-4.2.0", + urls = 
["https://github.com/rules-proto-grpc/rules_proto_grpc/archive/4.2.0.tar.gz"], +) + +http_archive( + name = "com_google_protobuf_javascript", + sha256 = "35bca1729532b0a77280bf28ab5937438e3dcccd6b31a282d9ae84c896b6f6e3", + strip_prefix = "protobuf-javascript-3.21.2", + urls = ["https://github.com/protocolbuffers/protobuf-javascript/archive/refs/tags/v3.21.2.tar.gz"], +) + +load("@rules_proto_grpc//:repositories.bzl", "rules_proto_grpc_toolchains", "rules_proto_grpc_repos") +rules_proto_grpc_toolchains() +rules_proto_grpc_repos() + +load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains") +rules_proto_dependencies() +rules_proto_toolchains() + +load("@//third_party:external_files.bzl", "external_files") +external_files() + +load("@//third_party:wasm_files.bzl", "wasm_files") +wasm_files() + +# Halide + +new_local_repository( + name = "halide", + build_file = "@//third_party/halide:BUILD.bazel", + path = "third_party/halide" +) + +http_archive( + name = "linux_halide", + sha256 = "d290fadf3f358c94aacf43c883de6468bb98883e26116920afd491ec0e440cd2", + strip_prefix = "Halide-15.0.1-x86-64-linux", + urls = ["https://github.com/halide/Halide/releases/download/v15.0.1/Halide-15.0.1-x86-64-linux-4c63f1befa1063184c5982b11b6a2cc17d4e5815.tar.gz"], + build_file = "@//third_party:halide.BUILD", +) + +http_archive( + name = "macos_x86_64_halide", + sha256 = "48ff073ac1aee5c4aca941a4f043cac64b38ba236cdca12567e09d803594a61c", + strip_prefix = "Halide-15.0.1-x86-64-osx", + urls = ["https://github.com/halide/Halide/releases/download/v15.0.1/Halide-15.0.1-x86-64-osx-4c63f1befa1063184c5982b11b6a2cc17d4e5815.tar.gz"], + build_file = "@//third_party:halide.BUILD", +) + +http_archive( + name = "macos_arm_64_halide", + sha256 = "db5d20d75fa7463490fcbc79c89f0abec9c23991f787c8e3e831fff411d5395c", + strip_prefix = "Halide-15.0.1-arm-64-osx", + urls = 
["https://github.com/halide/Halide/releases/download/v15.0.1/Halide-15.0.1-arm-64-osx-4c63f1befa1063184c5982b11b6a2cc17d4e5815.tar.gz"], + build_file = "@//third_party:halide.BUILD", +) + +http_archive( + name = "windows_halide", + sha256 = "61fd049bd75ee918ac6c30d0693aac6048f63f8d1fc4db31001573e58eae8dae", + strip_prefix = "Halide-15.0.1-x86-64-windows", + urls = ["https://github.com/halide/Halide/releases/download/v15.0.1/Halide-15.0.1-x86-64-windows-4c63f1befa1063184c5982b11b6a2cc17d4e5815.zip"], + build_file = "@//third_party:halide.BUILD", +) + +http_archive( + name = "pybind11_abseil", + sha256 = "0223b647b8cc817336a51e787980ebc299c8d5e64c069829bf34b69d72337449", + strip_prefix = "pybind11_abseil-2c4932ed6f6204f1656e245838f4f5eae69d2e29", + urls = ["https://github.com/pybind/pybind11_abseil/archive/2c4932ed6f6204f1656e245838f4f5eae69d2e29.tar.gz"], +) + +http_archive( + name = "com_github_nlohmann_json", + sha256 = "6bea5877b1541d353bd77bdfbdb2696333ae5ed8f9e8cc22df657192218cad91", + urls = ["https://github.com/nlohmann/json/releases/download/v3.9.1/include.zip"], + build_file = "@//third_party:nlohmann.BUILD", +) + diff --git a/build_android_examples.sh b/build_android_examples.sh new file mode 100644 index 0000000..6dbdd66 --- /dev/null +++ b/build_android_examples.sh @@ -0,0 +1,143 @@ +#!/bin/bash +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ========================================================================= +# +# Script to build all MediaPipe Android example apps. +# +# To build all apps and store them in out_dir, and install them: +# $ ./build_android_examples.sh -d out_dir +# Omitting -d and the associated directory saves all generated APKs in the +# current directory. +# $ ./build_android_examples.sh -d out_dir --nostrip +# Same as above except that the symbols are not stripped. +# +# To install the apps already stored in out_dir (after building them with the +# usages above): +# $ ./build_android_examples.sh -d out_dir -i +# Omitting -d and the associated directory assumes the apps are in the +# current directory. + +set -e + +function switch_to_opencv_3() { + echo "Switching to OpenCV 3" + sed -i -e 's:4.0.1/opencv-4.0.1:3.4.3/opencv-3.4.3:g' WORKSPACE + sed -i -e 's:libopencv_java4:libopencv_java3:g' third_party/opencv_android.BUILD +} + +function switch_to_opencv_4() { + echo "Switching to OpenCV 4" + sed -i -e 's:3.4.3/opencv-3.4.3:4.0.1/opencv-4.0.1:g' WORKSPACE + sed -i -e 's:libopencv_java3:libopencv_java4:g' third_party/opencv_android.BUILD +} + +out_dir="." +strip=true +install_only=false +app_dir="mediapipe/examples/android/src/java/com/google/mediapipe/apps" +bin_dir="bazel-bin" +declare -a default_bazel_flags=(build -c opt --config=android_arm64) + +while [[ -n $1 ]]; do + case $1 in + -d) + shift + out_dir=$1 + ;; + --nostrip) + strip=false + ;; + -i) + install_only=true + ;; + *) + echo "Unsupported input argument $1." 
+ exit 1 + ;; + esac + shift +done + +echo "app_dir: $app_dir" +echo "out_dir: $out_dir" +echo "strip: $strip" + +declare -a apks=() +declare -a bazel_flags +switch_to_opencv_3 + +apps="${app_dir}/*" +for app in ${apps}; do + if [[ -d "${app}" ]]; then + app_name=${app##*/} + if [[ ${app_name} == "basic" ]]; then + target_name="helloworld" + else + target_name=${app_name} + fi + target="${app}:${target_name}" + bin="${bin_dir}/${app}/${target_name}.apk" + + echo "=== Target: ${target}" + + if [[ $install_only == false ]]; then + bazel_flags=("${default_bazel_flags[@]}") + bazel_flags+=(${target}) + if [[ $strip == true ]]; then + bazel_flags+=(--linkopt=-s) + fi + fi + + if [[ ${app_name} == "objectdetection3d" ]]; then + categories=("shoe" "chair" "cup" "camera" "shoe_1stage" "chair_1stage") + for category in "${categories[@]}"; do + apk="${out_dir}/${target_name}_${category}.apk" + if [[ $install_only == false ]]; then + bazel_flags_extended=("${bazel_flags[@]}") + if [[ ${category} != "shoe" ]]; then + bazel_flags_extended+=(--define ${category}=true) + fi + bazelisk "${bazel_flags_extended[@]}" + cp -f "${bin}" "${apk}" + fi + apks+=(${apk}) + done + else + apk="${out_dir}/${target_name}.apk" + if [[ $install_only == false ]]; then + if [[ ${app_name} == "templatematchingcpu" ]]; then + switch_to_opencv_4 + fi + bazelisk "${bazel_flags[@]}" + cp -f "${bin}" "${apk}" + if [[ ${app_name} == "templatematchingcpu" ]]; then + switch_to_opencv_3 + fi + fi + apks+=(${apk}) + fi + fi +done + +echo +echo "Connect your device via adb to install the apps." +read -p "Press 'a' to abort, or press any other key to continue ..." -n 1 -r +echo +if [[ ! 
$REPLY =~ ^[Aa]$ ]]; then + for apk in "${apks[@]}"; do + echo "=== Installing $apk" + adb install -r "${apk}" + done +fi diff --git a/build_desktop_examples.sh b/build_desktop_examples.sh new file mode 100644 index 0000000..5bc687f --- /dev/null +++ b/build_desktop_examples.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +# +# Script to build/run all MediaPipe desktop example apps (with webcam input). +# +# To build and run all apps and store them in out_dir: +# $ ./build_desktop_examples.sh -d out_dir +# Omitting -d and the associated directory saves all generated apps in the +# current directory. +# To build all apps and store them in out_dir: +# $ ./build_desktop_examples.sh -d out_dir -b +# Omitting -d and the associated directory saves all generated apps in the +# current directory. +# To run all apps already stored in out_dir: +# $ ./build_desktop_examples.sh -d out_dir -r +# Omitting -d and the associated directory assumes all apps are in the current +# directory. + +set -e + +out_dir="." 
+build_only=false +run_only=false +app_dir="mediapipe/examples/desktop" +bin_dir="bazel-bin" +declare -a default_bazel_flags=(build -c opt --define MEDIAPIPE_DISABLE_GPU=1) + +while [[ -n $1 ]]; do + case $1 in + -d) + shift + out_dir=$1 + ;; + -b) + build_only=true + ;; + -r) + run_only=true + ;; + *) + echo "Unsupported input argument $1." + exit 1 + ;; + esac + shift +done + +echo "app_dir: $app_dir" +echo "out_dir: $out_dir" + +declare -a bazel_flags + +apps="${app_dir}/*" +for app in ${apps}; do + if [[ -d "${app}" ]]; then + target_name=${app##*/} + if [[ "${target_name}" == "autoflip" || + "${target_name}" == "hello_world" || + "${target_name}" == "media_sequence" || + "${target_name}" == "object_detection_3d" || + "${target_name}" == "template_matching" || + "${target_name}" == "youtube8m" ]]; then + continue + fi + target="${app}:${target_name}_cpu" + + echo "=== Target: ${target}" + + if [[ $run_only == false ]]; then + bazel_flags=("${default_bazel_flags[@]}") + bazel_flags+=(${target}) + + bazelisk "${bazel_flags[@]}" + cp -f "${bin_dir}/${app}/"*"_cpu" "${out_dir}" + fi + if [[ $build_only == false ]]; then + if [[ ${target_name} == "object_tracking" ]]; then + graph_name="tracking/object_detection_tracking" + elif [[ ${target_name} == "upper_body_pose_tracking" ]]; then + graph_name="pose_tracking/upper_body_pose_tracking" + else + graph_name="${target_name}/${target_name}" + fi + if [[ ${target_name} == "holistic_tracking" || + ${target_name} == "iris_tracking" || + ${target_name} == "pose_tracking" || + ${target_name} == "selfie_segmentation" || + ${target_name} == "upper_body_pose_tracking" ]]; then + graph_suffix="cpu" + else + graph_suffix="desktop_live" + fi + GLOG_logtostderr=1 "${out_dir}/${target_name}_cpu" \ + --calculator_graph_config_file=mediapipe/graphs/"${graph_name}_${graph_suffix}.pbtxt" + fi + fi +done diff --git a/build_ios_examples.sh b/build_ios_examples.sh new file mode 100644 index 0000000..e6a2271 --- /dev/null +++ 
b/build_ios_examples.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright 2020 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========================================================================= +# +# Script to build all MediaPipe iOS example apps. +# +# To build all apps and store them in out_dir: +# $ ./build_ios_examples.sh -d out_dir +# Omitting -d and the associated directory saves all generated IPAs in the +# current directory. +# $ ./build_ios_examples.sh -d out_dir --nostrip +# Same as above except that the symbols are not stripped. + +set -e + +out_dir="." +strip=true +app_dir="mediapipe/examples/ios" +bin_dir="bazel-bin" +declare -a default_bazel_flags=(build -c opt --config=ios_arm64) + +while [[ -n $1 ]]; do + case $1 in + -d) + shift + out_dir=$1 + ;; + --nostrip) + strip=false + ;; + *) + echo "Unsupported input argument $1." 
+ exit 1 + ;; + esac + shift +done + +echo "app_dir: $app_dir" +echo "out_dir: $out_dir" +echo "strip: $strip" + +declare -a bazel_flags + +apps="${app_dir}/*" +for app in ${apps}; do + if [[ -d "${app}" ]]; then + target_name=${app##*/} + if [[ "${target_name}" == "common" ]]; then + continue + fi + target="${app}:${target_name}" + + echo "=== Target: ${target}" + + bazel_flags=("${default_bazel_flags[@]}") + bazel_flags+=(${target}) + if [[ $strip == true ]]; then + bazel_flags+=(--linkopt=-s) + fi + + bazelisk "${bazel_flags[@]}" + cp -f "${bin_dir}/${app}/"*".ipa" "${out_dir}" + fi +done diff --git a/docs/BUILD b/docs/BUILD new file mode 100644 index 0000000..d19f709 --- /dev/null +++ b/docs/BUILD @@ -0,0 +1,31 @@ +# Placeholder for internal Python strict binary compatibility macro. + +py_binary( + name = "build_py_api_docs", + srcs = ["build_py_api_docs.py"], + deps = [ + "//third_party/py/absl:app", + "//third_party/py/absl/flags", + "//third_party/py/mediapipe", + "//third_party/py/tensorflow_docs/api_generator:generate_lib", + ], +) + +py_binary( + name = "build_java_api_docs", + srcs = ["build_java_api_docs.py"], + data = [ + "//third_party/android/sdk:api/26.txt", + "//third_party/java/doclava:doclet.jar", + "//third_party/java/jsilver:jsilver_jar", + ], + env = { + "DOCLAVA_JAR": "$(location //third_party/java/doclava:doclet.jar)", + "JSILVER_JAR": "$(location //third_party/java/jsilver:jsilver_jar)", + }, + deps = [ + "//third_party/py/absl:app", + "//third_party/py/absl/flags", + "//third_party/py/tensorflow_docs/api_generator/gen_java", + ], +) diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..13a5e64 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,21 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . 
+BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + rm -rf ./_build + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.pbxproj b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.pbxproj new file mode 100644 index 0000000..8a95288 --- /dev/null +++ b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.pbxproj @@ -0,0 +1,342 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + 8566B55D2ABABF9A00AAB22A /* MediaPipeTasksDocGen.h in Headers */ = {isa = PBXBuildFile; fileRef = 8566B55C2ABABF9A00AAB22A /* MediaPipeTasksDocGen.h */; settings = {ATTRIBUTES = (Public, ); }; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + 8566B5592ABABF9A00AAB22A /* MediaPipeTasksDocGen.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = MediaPipeTasksDocGen.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + 8566B55C2ABABF9A00AAB22A /* MediaPipeTasksDocGen.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = MediaPipeTasksDocGen.h; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 8566B5562ABABF9A00AAB22A /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 8566B54F2ABABF9A00AAB22A = { + isa = PBXGroup; + children = ( + 8566B55B2ABABF9A00AAB22A /* 
MediaPipeTasksDocGen */, + 8566B55A2ABABF9A00AAB22A /* Products */, + ); + sourceTree = ""; + }; + 8566B55A2ABABF9A00AAB22A /* Products */ = { + isa = PBXGroup; + children = ( + 8566B5592ABABF9A00AAB22A /* MediaPipeTasksDocGen.framework */, + ); + name = Products; + sourceTree = ""; + }; + 8566B55B2ABABF9A00AAB22A /* MediaPipeTasksDocGen */ = { + isa = PBXGroup; + children = ( + 8566B55C2ABABF9A00AAB22A /* MediaPipeTasksDocGen.h */, + ); + path = MediaPipeTasksDocGen; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXHeadersBuildPhase section */ + 8566B5542ABABF9A00AAB22A /* Headers */ = { + isa = PBXHeadersBuildPhase; + buildActionMask = 2147483647; + files = ( + 8566B55D2ABABF9A00AAB22A /* MediaPipeTasksDocGen.h in Headers */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXHeadersBuildPhase section */ + +/* Begin PBXNativeTarget section */ + 8566B5582ABABF9A00AAB22A /* MediaPipeTasksDocGen */ = { + isa = PBXNativeTarget; + buildConfigurationList = 8566B5602ABABF9A00AAB22A /* Build configuration list for PBXNativeTarget "MediaPipeTasksDocGen" */; + buildPhases = ( + 8566B5542ABABF9A00AAB22A /* Headers */, + 8566B5552ABABF9A00AAB22A /* Sources */, + 8566B5562ABABF9A00AAB22A /* Frameworks */, + 8566B5572ABABF9A00AAB22A /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = MediaPipeTasksDocGen; + productName = MediaPipeTasksDocGen; + productReference = 8566B5592ABABF9A00AAB22A /* MediaPipeTasksDocGen.framework */; + productType = "com.apple.product-type.framework"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 8566B5502ABABF9A00AAB22A /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastUpgradeCheck = 1430; + TargetAttributes = { + 8566B5582ABABF9A00AAB22A = { + CreatedOnToolsVersion = 14.3.1; + }; + }; + }; + buildConfigurationList = 8566B5532ABABF9A00AAB22A /* Build configuration list for PBXProject "MediaPipeTasksDocGen" */; 
+ compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 8566B54F2ABABF9A00AAB22A; + productRefGroup = 8566B55A2ABABF9A00AAB22A /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 8566B5582ABABF9A00AAB22A /* MediaPipeTasksDocGen */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + 8566B5572ABABF9A00AAB22A /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 8566B5552ABABF9A00AAB22A /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 8566B55E2ABABF9A00AAB22A /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + 
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + CURRENT_PROJECT_VERSION = 1; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.4; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + VERSIONING_SYSTEM = "apple-generic"; + VERSION_INFO_PREFIX = ""; + }; + name = Debug; + }; + 8566B55F2ABABF9A00AAB22A /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = 
YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + CURRENT_PROJECT_VERSION = 1; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.4; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + VERSIONING_SYSTEM = "apple-generic"; + VERSION_INFO_PREFIX = ""; + }; + name = Release; + }; + 8566B5612ABABF9A00AAB22A /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEFINES_MODULE = YES; + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + DYLIB_INSTALL_NAME_BASE = "@rpath"; + ENABLE_MODULE_VERIFIER = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_NSHumanReadableCopyright = ""; + INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + MODULE_VERIFIER_SUPPORTED_LANGUAGES = "objective-c objective-c++"; + MODULE_VERIFIER_SUPPORTED_LANGUAGE_STANDARDS = "gnu11 gnu++20"; + PRODUCT_BUNDLE_IDENTIFIER = 
com.google.mediapipe.MediaPipeTasksDocGen; + PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; + SKIP_INSTALL = YES; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 8566B5622ABABF9A00AAB22A /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEFINES_MODULE = YES; + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + DYLIB_INSTALL_NAME_BASE = "@rpath"; + ENABLE_MODULE_VERIFIER = YES; + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_NSHumanReadableCopyright = ""; + INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + MODULE_VERIFIER_SUPPORTED_LANGUAGES = "objective-c objective-c++"; + MODULE_VERIFIER_SUPPORTED_LANGUAGE_STANDARDS = "gnu11 gnu++20"; + PRODUCT_BUNDLE_IDENTIFIER = com.google.mediapipe.MediaPipeTasksDocGen; + PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; + SKIP_INSTALL = YES; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 8566B5532ABABF9A00AAB22A /* Build configuration list for PBXProject "MediaPipeTasksDocGen" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 8566B55E2ABABF9A00AAB22A /* Debug */, + 8566B55F2ABABF9A00AAB22A /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 8566B5602ABABF9A00AAB22A /* Build configuration list for PBXNativeTarget "MediaPipeTasksDocGen" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 8566B5612ABABF9A00AAB22A /* Debug */, + 8566B5622ABABF9A00AAB22A /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + 
rootObject = 8566B5502ABABF9A00AAB22A /* Project object */; +} diff --git a/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 0000000..919434a --- /dev/null +++ b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@ + + + + + diff --git a/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist new file mode 100644 index 0000000..b3ea173 --- /dev/null +++ b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist @@ -0,0 +1,8 @@ + + + + + IDEDidComputeMac32BitWarning + + + diff --git a/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/xcuserdata/macd.xcuserdatad/UserInterfaceState.xcuserstate b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/xcuserdata/macd.xcuserdatad/UserInterfaceState.xcuserstate new file mode 100644 index 0000000..d667b46 Binary files /dev/null and b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/project.xcworkspace/xcuserdata/macd.xcuserdatad/UserInterfaceState.xcuserstate differ diff --git a/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/xcuserdata/macd.xcuserdatad/xcschemes/xcschememanagement.plist b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/xcuserdata/macd.xcuserdatad/xcschemes/xcschememanagement.plist new file mode 100644 index 0000000..adc534a --- /dev/null +++ b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen.xcodeproj/xcuserdata/macd.xcuserdatad/xcschemes/xcschememanagement.plist @@ -0,0 +1,14 @@ + + + + + SchemeUserState + + MediaPipeTasksDocGen.xcscheme_^#shared#^_ + + 
orderHint + 0 + + + + diff --git a/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen/MediaPipeTasksDocGen.h b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen/MediaPipeTasksDocGen.h new file mode 100644 index 0000000..2ce44b2 --- /dev/null +++ b/docs/MediaPipeTasksDocGen/MediaPipeTasksDocGen/MediaPipeTasksDocGen.h @@ -0,0 +1,17 @@ +// +// MediaPipeTasksDocGen.h +// MediaPipeTasksDocGen +// +// Created by Mark McDonald on 20/9/2023. +// + +#import + +//! Project version number for MediaPipeTasksDocGen. +FOUNDATION_EXPORT double MediaPipeTasksDocGenVersionNumber; + +//! Project version string for MediaPipeTasksDocGen. +FOUNDATION_EXPORT const unsigned char MediaPipeTasksDocGenVersionString[]; + +// In this header, you should import all the public headers of your framework using statements like +// #import diff --git a/docs/MediaPipeTasksDocGen/Podfile b/docs/MediaPipeTasksDocGen/Podfile new file mode 100644 index 0000000..3c8d8f0 --- /dev/null +++ b/docs/MediaPipeTasksDocGen/Podfile @@ -0,0 +1,11 @@ +# Uncomment the next line to define a global platform for your project +platform :ios, '15.0' + +target 'MediaPipeTasksDocGen' do + # Comment the next line if you don't want to use dynamic frameworks + use_frameworks! + + # Pods for MediaPipeTasksDocGen + pod 'MediaPipeTasksText' + pod 'MediaPipeTasksVision' +end diff --git a/docs/MediaPipeTasksDocGen/README.md b/docs/MediaPipeTasksDocGen/README.md new file mode 100644 index 0000000..4752530 --- /dev/null +++ b/docs/MediaPipeTasksDocGen/README.md @@ -0,0 +1,9 @@ +# MediaPipeTasksDocGen + +This empty project is used to generate reference documentation for the +ObjectiveC and Swift libraries. + +Docs are generated using [Jazzy](https://github.com/realm/jazzy) and published +to [the developer site](https://developers.google.com/mediapipe/solutions/). + +To bump the API version used, edit [`Podfile`](./Podfile). 
diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..35d61ef --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,29 @@ +# Configuration for GitHub Pages + +remote_theme: pmarsceill/just-the-docs + +# Set a path/url to a logo that will be displayed instead of the title +logo: "images/logo_horizontal_color.png" + +# Enable or disable the site search +search_enabled: true + +# Set the search token separator for hyphenated-word search: +search_tokenizer_separator: /[\s/]+/ + +# Enable or disable heading anchors +heading_anchors: true + +# Aux links for the upper right navigation +aux_links: + "MediaPipe on GitHub": + - "//github.com/google/mediapipe" + +# Footer content appears at the bottom of every page's main content +footer_content: "© GOOGLE LLC | PRIVACY POLICY | TERMS OF SERVICE" + +# Color scheme currently only supports "dark", "light"/nil (default), or a custom scheme that you define +color_scheme: mediapipe + +# Google Analytics Tracking (optional) +ga_tracking: UA-140696581-2 diff --git a/docs/_layouts/forward.html b/docs/_layouts/forward.html new file mode 100644 index 0000000..ec97e98 --- /dev/null +++ b/docs/_layouts/forward.html @@ -0,0 +1,13 @@ + + + + + + Redirecting + + +

This page now lives on https://developers.google.com/mediapipe/. If you aren't automatically + redirected, follow this + link.

def main(_) -> None:
  """Builds the MediaPipe Java API reference docs.

  Finds the nearest ancestor directory of this script that is named
  `mediapipe`, copies the framework image sources into the tasks source tree,
  and runs `gen_java.gen_java_docs` over `tasks/java`.

  Args:
    _: Unused; the argument `app.run` passes to `main`.

  Raises:
    FileNotFoundError: If no ancestor directory of this script is named
      `mediapipe`.
  """
  # `__file__` may be a relative path (e.g. `python docs/build_java_api_docs.py`
  # on interpreters that do not absolutize it). The ancestors of a relative
  # path end at '.', so the search would fail even inside a valid checkout.
  # `absolute()` (not `resolve()`) keeps symlinked paths as they appear.
  script_path = pathlib.Path(__file__).absolute()

  # Find the nearest `mediapipe` dir; `parents` runs up to the filesystem root.
  mp_root = next(
      (parent for parent in script_path.parents if parent.name == 'mediapipe'),
      None)
  if mp_root is None:
    # We've hit the filesystem root - abort.
    raise FileNotFoundError('"mediapipe" root not found')

  # Find the root from which all packages are relative.
  root = mp_root.parent

  # Externally, parts of the repo are nested inside a mediapipe/ directory
  # that does not exist internally. Support both.
  nested_root = mp_root / 'mediapipe'
  if nested_root.exists():
    mp_root = nested_root

  # We need to copy this into the tasks dir to ensure we don't leave broken
  # links in the generated docs.
  old_api_dir = 'java/com/google/mediapipe/framework/image'
  shutil.copytree(
      mp_root / old_api_dir,
      mp_root / 'tasks' / old_api_dir,
      dirs_exist_ok=True)

  gen_java.gen_java_docs(
      package='com.google.mediapipe',
      source_path=mp_root / 'tasks/java',
      output_dir=pathlib.Path(_OUT_DIR.value),
      site_path=pathlib.Path(_SITE_PATH.value),
      federated_docs={'https://developer.android.com': root / _ANDROID_SDK})
def gen_api_docs():
  """Builds the reference docs for the imported `mediapipe_model_maker` package.

  The doc generator is configured from this script's flags and module
  constants, and its output is written to the directory named by
  `--output_dir`.
  """
  # Directory containing the package's `__init__`; passed as `base_dir`.
  package_root = os.path.dirname(mediapipe_model_maker.__file__)
  generator_options = dict(
      root_title=PROJECT_FULL_NAME,
      py_modules=[(PROJECT_SHORT_NAME, mediapipe_model_maker)],
      base_dir=package_root,
      code_url_prefix=_URL_PREFIX.value,
      search_hints=_SEARCH_HINTS.value,
      site_path=_SITE_PATH.value,
      callbacks=[],
  )
  output_dir = _OUTPUT_DIR.value

  generate_lib.DocGenerator(**generator_options).build(output_dir)

  print('Docs output to:', output_dir)


def main(_):
  """Runs doc generation; the argument supplied by `app.run` is ignored."""
  gen_api_docs()
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""MediaPipe reference docs generation script. + +This script generates API reference docs for the `mediapipe` PIP package. + +$> pip install -U git+https://github.com/tensorflow/docs mediapipe +$> python build_py_api_docs.py +""" + +import os + +from absl import app +from absl import flags + +from tensorflow_docs.api_generator import generate_lib + +try: + # mediapipe has not been set up to work with bazel yet, so catch & report. 
def gen_api_docs():
  """Builds the reference docs for the imported `mediapipe` package.

  The doc generator is configured from this script's flags and module
  constants, and its output is written to the directory named by
  `--output_dir`.
  """
  # Drop `mp.solutions` (when present) before handing the module to the
  # generator. NOTE(review): presumably this keeps the solutions API out of
  # the generated reference - confirm.
  if hasattr(mp, 'solutions'):
    del mp.solutions

  # Directory containing the package's `__init__`; passed as `base_dir`.
  package_root = os.path.dirname(mp.__file__)
  generator_options = dict(
      root_title=PROJECT_FULL_NAME,
      py_modules=[(PROJECT_SHORT_NAME, mp)],
      base_dir=package_root,
      code_url_prefix=_URL_PREFIX.value,
      search_hints=_SEARCH_HINTS.value,
      site_path=_SITE_PATH.value,
      callbacks=[],
  )
  output_dir = _OUTPUT_DIR.value

  generate_lib.DocGenerator(**generator_options).build(output_dir)

  print('Docs output to:', output_dir)


def main(_):
  """Runs doc generation; the argument supplied by `app.run` is ignored."""
  gen_api_docs()
+ +""" +import sphinx_rtd_theme + + +# -- Project information ----------------------------------------------------- + +project = 'MediaPipe' +author = 'Google LLC' + +# The full version, including alpha/beta/rc tags +release = 'v0.7.5' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'recommonmark' +] + +master_doc = 'index' + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] diff --git a/docs/data/visualizer/sample_trace.binarypb b/docs/data/visualizer/sample_trace.binarypb new file mode 100644 index 0000000..fc25629 Binary files /dev/null and b/docs/data/visualizer/sample_trace.binarypb differ diff --git a/docs/framework_concepts/building_graphs_cpp.md b/docs/framework_concepts/building_graphs_cpp.md new file mode 100644 index 0000000..26cdfe1 --- /dev/null +++ b/docs/framework_concepts/building_graphs_cpp.md @@ -0,0 +1,704 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/framework_concepts/graphs_cpp +title: Building Graphs in C++ +parent: Graphs +nav_order: 1 +--- + +# Building Graphs in C++ +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +C++ graph builder is a powerful tool for: + +* Building complex graphs +* Parametrizing graphs (e.g. setting a delegate on `InferenceCalculator`, + enabling/disabling parts of the graph) +* Deduplicating graphs (e.g. instead of CPU and GPU dedicated graphs in pbtxt + you can have a single code that constructs required graphs, sharing as much + as possible) +* Supporting optional graph inputs/outputs +* Customizing graphs per platform + +## Basic Usage + +Let's see how C++ graph builder can be used for a simple graph: + +```proto +# Graph inputs. +input_stream: "input_tensors" +input_side_packet: "model" + +# Graph outputs. +output_stream: "output_tensors" + +node { + calculator: "InferenceCalculator" + input_stream: "TENSORS:input_tensors" + input_side_packet: "MODEL:model" + output_stream: "TENSORS:output_tensors" + node_options: { + [type.googleapis.com/mediapipe.InferenceCalculatorOptions] { + # Requesting GPU delegate. 
+ delegate { gpu {} } + } + } +} +``` + +Function to build the above `CalculatorGraphConfig` may look like: + +```c++ +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Graph inputs. + Stream> input_tensors = + graph.In(0).SetName("input_tensors").Cast>(); + SidePacket model = + graph.SideIn(0).SetName("model").Cast(); + + auto& inference_node = graph.AddNode("InferenceCalculator"); + auto& inference_opts = + inference_node.GetOptions(); + // Requesting GPU delegate. + inference_opts.mutable_delegate()->mutable_gpu(); + input_tensors.ConnectTo(inference_node.In("TENSORS")); + model.ConnectTo(inference_node.SideIn("MODEL")); + Stream> output_tensors = + inference_node.Out("TENSORS").Cast>(); + + // Graph outputs. + output_tensors.SetName("output_tensors").ConnectTo(graph.Out(0)); + + // Get `CalculatorGraphConfig` to pass it into `CalculatorGraph` + return graph.GetConfig(); +} +``` + +Short summary: + +* Use `Graph::In/SideIn` to get graph inputs as `Stream/SidePacket` +* Use `Node::Out/SideOut` to get node outputs as `Stream/SidePacket` +* Use `Stream/SidePacket::ConnectTo` to connect streams and side packets to + node inputs (`Node::In/SideIn`) and graph outputs (`Graph::Out/SideOut`) + * There's a "shortcut" operator `>>` that you can use instead of + `ConnectTo` function (E.g. `x >> node.In("IN")`). +* `Stream/SidePacket::Cast` is used to cast stream or side packet of `AnyType` + (E.g. `Stream in = graph.In(0);`) to a particular type + * Using actual types instead of `AnyType` sets you on a better path for + unleashing graph builder capabilities and improving your graphs + readability. + +## Advanced Usage + +### Utility Functions + +Let's extract inference construction code into a dedicated utility function to +help for readability and code reuse: + +```c++ +// Updates graph to run inference. 
+Stream> RunInference( + Stream> tensors, SidePacket model, + const InferenceCalculatorOptions::Delegate& delegate, Graph& graph) { + auto& inference_node = graph.AddNode("InferenceCalculator"); + auto& inference_opts = + inference_node.GetOptions(); + *inference_opts.mutable_delegate() = delegate; + tensors.ConnectTo(inference_node.In("TENSORS")); + model.ConnectTo(inference_node.SideIn("MODEL")); + return inference_node.Out("TENSORS").Cast>(); +} + +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Graph inputs. + Stream> input_tensors = + graph.In(0).SetName("input_tensors").Cast>(); + SidePacket model = + graph.SideIn(0).SetName("model").Cast(); + + InferenceCalculatorOptions::Delegate delegate; + delegate.mutable_gpu(); + Stream> output_tensors = + RunInference(input_tensors, model, delegate, graph); + + // Graph outputs. + output_tensors.SetName("output_tensors").ConnectTo(graph.Out(0)); + + return graph.GetConfig(); +} +``` + +As a result, `RunInference` provides a clear interface stating what are the +inputs/outputs and their types. + +It can be easily reused, e.g. it's only a few lines if you want to run an extra +model inference: + +```c++ + // Run first inference. + Stream> output_tensors = + RunInference(input_tensors, model, delegate, graph); + // Run second inference on the output of the first one. + Stream> extra_output_tensors = + RunInference(output_tensors, extra_model, delegate, graph); +``` + +And you don't need to duplicate names and tags (`InferenceCalculator`, +`TENSORS`, `MODEL`) or introduce dedicated constants here and there - those +details are localized to `RunInference` function. + +Tip: extracting `RunInference` and similar functions to dedicated modules (e.g. +inference.h/cc which depends on the inference calculator) enables reuse in +graphs construction code and helps automatically pull in calculator dependencies +(e.g. 
no need to manually add `:inference_calculator` dep, just let your IDE +include `inference.h` and build cleaner pull in corresponding dependency). + +### Utility Classes + +And surely, it's not only about functions, in some cases it's beneficial to +introduce utility classes which can help making your graph construction code +more readable and less error prone. + +MediaPipe offers `PassThroughCalculator` calculator, which is simply passing +through its inputs: + +``` +input_stream: "float_value" +input_stream: "int_value" +input_stream: "bool_value" + +output_stream: "passed_float_value" +output_stream: "passed_int_value" +output_stream: "passed_bool_value" + +node { + calculator: "PassThroughCalculator" + input_stream: "float_value" + input_stream: "int_value" + input_stream: "bool_value" + # The order must be the same as for inputs (or you can use explicit indexes) + output_stream: "passed_float_value" + output_stream: "passed_int_value" + output_stream: "passed_bool_value" +} +``` + +Let's see the straightforward C++ construction code to create the above graph: + +```c++ +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Graph inputs. + Stream float_value = graph.In(0).SetName("float_value").Cast(); + Stream int_value = graph.In(1).SetName("int_value").Cast(); + Stream bool_value = graph.In(2).SetName("bool_value").Cast(); + + auto& pass_node = graph.AddNode("PassThroughCalculator"); + float_value.ConnectTo(pass_node.In("")[0]); + int_value.ConnectTo(pass_node.In("")[1]); + bool_value.ConnectTo(pass_node.In("")[2]); + Stream passed_float_value = pass_node.Out("")[0].Cast(); + Stream passed_int_value = pass_node.Out("")[1].Cast(); + Stream passed_bool_value = pass_node.Out("")[2].Cast(); + + // Graph outputs. 
+ passed_float_value.SetName("passed_float_value").ConnectTo(graph.Out(0)); + passed_int_value.SetName("passed_int_value").ConnectTo(graph.Out(1)); + passed_bool_value.SetName("passed_bool_value").ConnectTo(graph.Out(2)); + + // Get `CalculatorGraphConfig` to pass it into `CalculatorGraph` + return graph.GetConfig(); +} +``` + +While `pbtxt` representation may be error-prone (when we have many inputs to pass +through), C++ code looks even worse: repeated empty tags and `Cast` calls. Let's +see how we can do better by introducing a `PassThroughNodeBuilder`: + +```c++ +class PassThroughNodeBuilder { + public: + explicit PassThroughNodeBuilder(Graph& graph) + : node_(graph.AddNode("PassThroughCalculator")) {} + + template <typename T> + Stream<T> PassThrough(Stream<T> stream) { + stream.ConnectTo(node_.In(index_)); + return node_.Out(index_++).Cast<T>(); + } + + private: + int index_ = 0; + GenericNode& node_; +}; +``` + +And now graph construction code can look like: + +```c++ +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Graph inputs. + Stream<float> float_value = graph.In(0).SetName("float_value").Cast<float>(); + Stream<int> int_value = graph.In(1).SetName("int_value").Cast<int>(); + Stream<bool> bool_value = graph.In(2).SetName("bool_value").Cast<bool>(); + + PassThroughNodeBuilder pass_node_builder(graph); + Stream<float> passed_float_value = pass_node_builder.PassThrough(float_value); + Stream<int> passed_int_value = pass_node_builder.PassThrough(int_value); + Stream<bool> passed_bool_value = pass_node_builder.PassThrough(bool_value); + + // Graph outputs. 
+ passed_float_value.SetName("passed_float_value").ConnectTo(graph.Out(0)); + passed_int_value.SetName("passed_int_value").ConnectTo(graph.Out(1)); + passed_bool_value.SetName("passed_bool_value").ConnectTo(graph.Out(2)); + + // Get `CalculatorGraphConfig` to pass it into `CalculatorGraph` + return graph.GetConfig(); +} +``` + +Now you can't have incorrect order or index in your pass through construction +code and save some typing by guessing the type for `Cast` from the `PassThrough` +input. + +Tip: the same as for the `RunInference` function, extracting +`PassThroughNodeBuilder` and similar utility classes into dedicated modules +enables reuse in graph construction code and helps to automatically pull in the +corresponding calculator dependencies. + +## Dos and Don'ts + +### Define graph inputs at the very beginning if possible + +```c++ {.bad} +Stream RunSomething(Stream a, Stream b, Graph& graph) { + Stream c = graph.In(2).SetName("c").Cast(); // Bad. + // ... +} + +CalculatorGraphConfig BuildGraph() { + Graph graph; + + Stream a = graph.In(0).SetName("a").Cast(); + // 10/100/N lines of code. + Stream b = graph.In(1).SetName("b").Cast() // Bad. + Stream d = RunSomething(a, b, graph); + // ... + + return graph.GetConfig(); +} + +``` + +In the above code: + +* It can be hard to guess how many inputs you have in the graph. +* Can be error prone overall and hard to maintain in future (e.g. is it a + correct index? name? what if some inputs are removed or made optional? + etc.). +* `RunSomething` reuse is limited because other graphs may have different + inputs + +Instead, define your graph inputs at the very beginning of your graph builder: + +```c++ {.good} +Stream RunSomething(Stream a, Stream b, Stream c, Graph& graph) { + // ... +} + +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Inputs. 
+ Stream a = graph.In(0).SetName("a").Cast(); + Stream b = graph.In(1).SetName("b").Cast(); + Stream c = graph.In(2).SetName("c").Cast(); + + // 10/100/N lines of code. + Stream d = RunSomething(a, b, c, graph); + // ... + + return graph.GetConfig(); +} +``` + +Use `std::optional` if you have an input stream or side packet that is not +always defined and put it at the very beginning: + +```c++ {.good} +std::optional> a; +if (needs_a) { + a = graph.In(0).SetName("a").Cast(); +} +``` + +Note: of course, there can be exceptions - for example, there can be a use case +where calling `RunSomething1(..., graph)`, ..., `RunSomethingN(..., graph)` is +**intended to add new inputs**, so afterwards you can iterate over them and feed +only added inputs into the graph. However, in any case, try to make it easy for +readers to find out what graph inputs it has or may have. + +### Define graph outputs at the very end + +```c++ {.bad} +void RunSomething(Stream input, Graph& graph) { + // ... + node.Out("OUTPUT_F") + .SetName("output_f").ConnectTo(graph.Out(2)); // Bad. +} + +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // 10/100/N lines of code. + node.Out("OUTPUT_D") + .SetName("output_d").ConnectTo(graph.Out(0)); // Bad. + // 10/100/N lines of code. + node.Out("OUTPUT_E") + .SetName("output_e").ConnectTo(graph.Out(1)); // Bad. + // 10/100/N lines of code. + RunSomething(input, graph); + // ... + + return graph.GetConfig(); +} +``` + +In the above code: + +* It can be hard to guess how many outputs you have in the graph. +* Can be error prone overall and hard to maintain in future (e.g. is it a + correct index? name? what if some outputs are removed or made optional? + etc.). +* `RunSomething` reuse is limited as other graphs may have different outputs + +Instead, define your graph outputs at the very end of your graph builder: + +```c++ {.good} +Stream RunSomething(Stream input, Graph& graph) { + // ... 
+ return node.Out("OUTPUT_F").Cast(); +} + +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // 10/100/N lines of code. + Stream d = node.Out("OUTPUT_D").Cast(); + // 10/100/N lines of code. + Stream e = node.Out("OUTPUT_E").Cast(); + // 10/100/N lines of code. + Stream f = RunSomething(input, graph); + // ... + + // Outputs. + d.SetName("output_d").ConnectTo(graph.Out(0)); + e.SetName("output_e").ConnectTo(graph.Out(1)); + f.SetName("output_f").ConnectTo(graph.Out(2)); + + return graph.GetConfig(); +} +``` + +### Keep nodes decoupled from each other + +In MediaPipe, packet streams and side packets are as meaningful as processing +nodes. And any node input requirements and output products are expressed clearly +and independently in terms of the streams and side packets it consumes and +produces. + +```c++ {.bad} +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Inputs. + Stream a = graph.In(0).Cast(); + + auto& node1 = graph.AddNode("Calculator1"); + a.ConnectTo(node1.In("INPUT")); + + auto& node2 = graph.AddNode("Calculator2"); + node1.Out("OUTPUT").ConnectTo(node2.In("INPUT")); // Bad. + + auto& node3 = graph.AddNode("Calculator3"); + node1.Out("OUTPUT").ConnectTo(node3.In("INPUT_B")); // Bad. + node2.Out("OUTPUT").ConnectTo(node3.In("INPUT_C")); // Bad. + + auto& node4 = graph.AddNode("Calculator4"); + node1.Out("OUTPUT").ConnectTo(node4.In("INPUT_B")); // Bad. + node2.Out("OUTPUT").ConnectTo(node4.In("INPUT_C")); // Bad. + node3.Out("OUTPUT").ConnectTo(node4.In("INPUT_D")); // Bad. + + // Outputs. + node1.Out("OUTPUT").SetName("b").ConnectTo(graph.Out(0)); // Bad. + node2.Out("OUTPUT").SetName("c").ConnectTo(graph.Out(1)); // Bad. + node3.Out("OUTPUT").SetName("d").ConnectTo(graph.Out(2)); // Bad. + node4.Out("OUTPUT").SetName("e").ConnectTo(graph.Out(3)); // Bad. + + return graph.GetConfig(); +} +``` + +In the above code: + +* Nodes are coupled to each other, e.g. 
`node4` knows where its inputs are + coming from (`node1`, `node2`, `node3`) and it complicates refactoring, + maintenance and code reuse + * Such usage pattern is a downgrade from proto representation, where nodes + are decoupled by default. +* `node#.Out("OUTPUT")` calls are duplicated and readability suffers as you + could use cleaner names instead and also provide an actual type. + +So, to fix the above issues you can write the following graph construction code: + +```c++ {.good} +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Inputs. + Stream a = graph.In(0).Cast(); + + // `node1` usage is limited to 3 lines below. + auto& node1 = graph.AddNode("Calculator1"); + a.ConnectTo(node1.In("INPUT")); + Stream b = node1.Out("OUTPUT").Cast(); + + // `node2` usage is limited to 3 lines below. + auto& node2 = graph.AddNode("Calculator2"); + b.ConnectTo(node2.In("INPUT")); + Stream c = node2.Out("OUTPUT").Cast(); + + // `node3` usage is limited to 4 lines below. + auto& node3 = graph.AddNode("Calculator3"); + b.ConnectTo(node3.In("INPUT_B")); + c.ConnectTo(node3.In("INPUT_C")); + Stream d = node3.Out("OUTPUT").Cast(); + + // `node4` usage is limited to 5 lines below. + auto& node4 = graph.AddNode("Calculator4"); + b.ConnectTo(node4.In("INPUT_B")); + c.ConnectTo(node4.In("INPUT_C")); + d.ConnectTo(node4.In("INPUT_D")); + Stream e = node4.Out("OUTPUT").Cast(); + + // Outputs. + b.SetName("b").ConnectTo(graph.Out(0)); + c.SetName("c").ConnectTo(graph.Out(1)); + d.SetName("d").ConnectTo(graph.Out(2)); + e.SetName("e").ConnectTo(graph.Out(3)); + + return graph.GetConfig(); +} +``` + +Now, if needed, you can easily remove `node1` and make `b` a graph input and no +updates are needed to `node2`, `node3`, `node4` (same as in proto representation +by the way), because they are decoupled from each other. 
+ +Overall, the above code replicates the proto graph more closely: + +```proto +input_stream: "a" + +node { + calculator: "Calculator1" + input_stream: "INPUT:a" + output_stream: "OUTPUT:b" +} + +node { + calculator: "Calculator2" + input_stream: "INPUT:b" + output_stream: "OUTPUT:c" +} + +node { + calculator: "Calculator3" + input_stream: "INPUT_B:b" + input_stream: "INPUT_C:c" + output_stream: "OUTPUT:d" +} + +node { + calculator: "Calculator4" + input_stream: "INPUT_B:b" + input_stream: "INPUT_C:c" + input_stream: "INPUT_D:d" + output_stream: "OUTPUT:e" +} + +output_stream: "b" +output_stream: "c" +output_stream: "d" +output_stream: "e" +``` + +On top of that, now you can extract utility functions for further reuse in other graphs: + +```c++ {.good} +Stream RunCalculator1(Stream a, Graph& graph) { + auto& node = graph.AddNode("Calculator1"); + a.ConnectTo(node.In("INPUT")); + return node.Out("OUTPUT").Cast(); +} + +Stream RunCalculator2(Stream b, Graph& graph) { + auto& node = graph.AddNode("Calculator2"); + b.ConnectTo(node.In("INPUT")); + return node.Out("OUTPUT").Cast(); +} + +Stream RunCalculator3(Stream b, Stream c, Graph& graph) { + auto& node = graph.AddNode("Calculator3"); + b.ConnectTo(node.In("INPUT_B")); + c.ConnectTo(node.In("INPUT_C")); + return node.Out("OUTPUT").Cast(); +} + +Stream RunCalculator4(Stream b, Stream c, Stream d, Graph& graph) { + auto& node = graph.AddNode("Calculator4"); + b.ConnectTo(node.In("INPUT_B")); + c.ConnectTo(node.In("INPUT_C")); + d.ConnectTo(node.In("INPUT_D")); + return node.Out("OUTPUT").Cast(); +} + +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Inputs. + Stream a = graph.In(0).Cast(); + + Stream b = RunCalculator1(a, graph); + Stream c = RunCalculator2(b, graph); + Stream d = RunCalculator3(b, c, graph); + Stream e = RunCalculator4(b, c, d, graph); + + // Outputs.
+ b.SetName("b").ConnectTo(graph.Out(0)); + c.SetName("c").ConnectTo(graph.Out(1)); + d.SetName("d").ConnectTo(graph.Out(2)); + e.SetName("e").ConnectTo(graph.Out(3)); + + return graph.GetConfig(); +} +``` + +### Separate nodes for better readability + +```c++ {.bad} +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Inputs. + Stream a = graph.In(0).Cast(); + auto& node1 = graph.AddNode("Calculator1"); + a.ConnectTo(node1.In("INPUT")); + Stream b = node1.Out("OUTPUT").Cast(); + auto& node2 = graph.AddNode("Calculator2"); + b.ConnectTo(node2.In("INPUT")); + Stream c = node2.Out("OUTPUT").Cast(); + auto& node3 = graph.AddNode("Calculator3"); + b.ConnectTo(node3.In("INPUT_B")); + c.ConnectTo(node3.In("INPUT_C")); + Stream d = node3.Out("OUTPUT").Cast(); + auto& node4 = graph.AddNode("Calculator4"); + b.ConnectTo(node4.In("INPUT_B")); + c.ConnectTo(node4.In("INPUT_C")); + d.ConnectTo(node4.In("INPUT_D")); + Stream e = node4.Out("OUTPUT").Cast(); + // Outputs. + b.SetName("b").ConnectTo(graph.Out(0)); + c.SetName("c").ConnectTo(graph.Out(1)); + d.SetName("d").ConnectTo(graph.Out(2)); + e.SetName("e").ConnectTo(graph.Out(3)); + + return graph.GetConfig(); +} +``` + +In the above code, it can be hard to grasp the idea where each node begins and +ends. To improve this and help your code readers, you can simply have blank +lines before and after each node: + +```c++ {.good} +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Inputs. 
+ Stream a = graph.In(0).Cast(); + + auto& node1 = graph.AddNode("Calculator1"); + a.ConnectTo(node1.In("INPUT")); + Stream b = node1.Out("OUTPUT").Cast(); + + auto& node2 = graph.AddNode("Calculator2"); + b.ConnectTo(node2.In("INPUT")); + Stream c = node2.Out("OUTPUT").Cast(); + + auto& node3 = graph.AddNode("Calculator3"); + b.ConnectTo(node3.In("INPUT_B")); + c.ConnectTo(node3.In("INPUT_C")); + Stream d = node3.Out("OUTPUT").Cast(); + + auto& node4 = graph.AddNode("Calculator4"); + b.ConnectTo(node4.In("INPUT_B")); + c.ConnectTo(node4.In("INPUT_C")); + d.ConnectTo(node4.In("INPUT_D")); + Stream e = node4.Out("OUTPUT").Cast(); + + // Outputs. + b.SetName("b").ConnectTo(graph.Out(0)); + c.SetName("c").ConnectTo(graph.Out(1)); + d.SetName("d").ConnectTo(graph.Out(2)); + e.SetName("e").ConnectTo(graph.Out(3)); + + return graph.GetConfig(); +} +``` + +Also, the above representation matches `CalculatorGraphConfig` proto +representation better. + +If you extract nodes into utility functions, they are scoped within functions +already and it's clear where they begin and end, so it's completely fine to +have: + +```c++ {.good} +CalculatorGraphConfig BuildGraph() { + Graph graph; + + // Inputs. + Stream a = graph.In(0).Cast(); + + Stream b = RunCalculator1(a, graph); + Stream c = RunCalculator2(b, graph); + Stream d = RunCalculator3(b, c, graph); + Stream e = RunCalculator4(b, c, d, graph); + + // Outputs. 
+ b.SetName("b").ConnectTo(graph.Out(0)); + c.SetName("c").ConnectTo(graph.Out(1)); + d.SetName("d").ConnectTo(graph.Out(2)); + e.SetName("e").ConnectTo(graph.Out(3)); + + return graph.GetConfig(); +} +``` diff --git a/docs/framework_concepts/calculators.md b/docs/framework_concepts/calculators.md new file mode 100644 index 0000000..3a3661d --- /dev/null +++ b/docs/framework_concepts/calculators.md @@ -0,0 +1,471 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/framework_concepts/calculators +title: Calculators +parent: Framework Concepts +nav_order: 1 +--- + +# Calculators +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +Each calculator is a node of a graph. We describe how to create a new +calculator, how to initialize a calculator, how to perform its calculations, +input and output streams, timestamps, and options. Each node in the graph is +implemented as a `Calculator`. The bulk of graph execution happens inside its +calculators. A calculator may receive zero or more input streams and/or side +packets and produces zero or more output streams and/or side packets. + +## CalculatorBase + +A calculator is created by defining a new sub-class of the +[`CalculatorBase`](https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_base.cc) +class, implementing a number of methods, and registering the new sub-class with +Mediapipe. At a minimum, a new calculator must implement the below four methods + +* `GetContract()` + * Calculator authors can specify the expected types of inputs and outputs + of a calculator in GetContract(). 
When a graph is initialized, the + framework calls a static method to verify if the packet types of the + connected inputs and outputs match the information in this + specification. +* `Open()` + * After a graph starts, the framework calls `Open()`. The input side + packets are available to the calculator at this point. `Open()` + interprets the node configuration operations (see [Graphs](graphs.md)) + and prepares the calculator's per-graph-run state. This function may + also write packets to calculator outputs. An error during `Open()` can + terminate the graph run. +* `Process()` + * For a calculator with inputs, the framework calls `Process()` repeatedly + whenever at least one input stream has a packet available. The framework + by default guarantees that all inputs have the same timestamp (see + [Synchronization](synchronization.md) for more information). Multiple + `Process()` calls can be invoked simultaneously when parallel execution + is enabled. If an error occurs during `Process()`, the framework calls + `Close()` and the graph run terminates. +* `Close()` + * After all calls to `Process()` finish or when all input streams close, + the framework calls `Close()`. This function is always called if + `Open()` was called and succeeded and even if the graph run terminated + because of an error. No inputs are available via any input streams + during `Close()`, but it still has access to input side packets and + therefore may write outputs. After `Close()` returns, the calculator + should be considered a dead node. The calculator object is destroyed as + soon as the graph finishes running. + +The following are code snippets from +[CalculatorBase.h](https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_base.h). + +```c++ +class CalculatorBase { + public: + ... + + // The subclasses of CalculatorBase must implement GetContract. + // ... 
+ static absl::Status GetContract(CalculatorContract* cc); + + // Open is called before any Process() calls, on a freshly constructed + // calculator. Subclasses may override this method to perform necessary + // setup, and possibly output Packets and/or set output streams' headers. + // ... + virtual absl::Status Open(CalculatorContext* cc) { + return absl::OkStatus(); + } + + // Processes the incoming inputs. May call the methods on cc to access + // inputs and produce outputs. + // ... + virtual absl::Status Process(CalculatorContext* cc) = 0; + + // Is called if Open() was called and succeeded. Is called either + // immediately after processing is complete or after a graph run has ended + // (if an error occurred in the graph). ... + virtual absl::Status Close(CalculatorContext* cc) { + return absl::OkStatus(); + } + + ... +}; +``` + +## Life of a calculator + +During initialization of a MediaPipe graph, the framework calls a +`GetContract()` static method to determine what kinds of packets are expected. + +The framework constructs and destroys the entire calculator for each graph run +(e.g. once per video or once per image). Expensive or large objects that remain +constant across graph runs should be supplied as input side packets so the +calculations are not repeated on subsequent runs. + +After initialization, for each run of the graph, the following sequence occurs: + +* `Open()` +* `Process()` (repeatedly) +* `Close()` + +The framework calls `Open()` to initialize the calculator. `Open()` should +interpret any options and set up the calculator's per-graph-run state. `Open()` +may obtain input side packets and write packets to calculator outputs. If +appropriate, it should call `SetOffset()` to reduce potential packet buffering +of input streams. 
+ +If an error occurs during `Open()` or `Process()` (as indicated by one of them +returning a non-`Ok` status), the graph run is terminated with no further calls +to the calculator's methods, and the calculator is destroyed. + +For a calculator with inputs, the framework calls `Process()` whenever at least +one input has a packet available. The framework guarantees that inputs all have +the same timestamp, that timestamps increase with each call to `Process()` and +that all packets are delivered. As a consequence, some inputs may not have any +packets when `Process()` is called. An input whose packet is missing appears to +produce an empty packet (with no timestamp). + +The framework calls `Close()` after all calls to `Process()`. All inputs will +have been exhausted, but `Close()` has access to input side packets and may +write outputs. After `Close()` returns, the calculator is destroyed. + +Calculators with no inputs are referred to as sources. A source calculator +continues to have `Process()` called as long as it returns an `Ok` status. A +source calculator indicates that it is exhausted by returning a stop status +(i.e. [`mediapipe::tool::StatusStop()`](https://github.com/google/mediapipe/tree/master/mediapipe/framework/tool/status_util.cc)). + +## Identifying inputs and outputs + +The public interface to a calculator consists of a set of input streams and +output streams. In a `CalculatorGraphConfig`, the outputs from some +calculators are connected to the inputs of other calculators using named +streams. Stream names are normally lowercase, while input and output tags are +normally UPPERCASE. In the example below, the output with tag name `VIDEO` is +connected to the input with tag name `VIDEO_IN` using the stream named +`video_stream`.
+ +```proto +# Graph describing calculator SomeAudioVideoCalculator +node { + calculator: "SomeAudioVideoCalculator" + input_stream: "INPUT:combined_input" + output_stream: "VIDEO:video_stream" +} +node { + calculator: "SomeVideoCalculator" + input_stream: "VIDEO_IN:video_stream" + output_stream: "VIDEO_OUT:processed_video" +} +``` + +Input and output streams can be identified by index number, by tag name, or by a +combination of tag name and index number. You can see some examples of input and +output identifiers in the example below. `SomeAudioVideoCalculator` identifies +its video output by tag and its audio outputs by the combination of tag and +index. The output with tag `VIDEO` is connected to the stream named +`video_stream`. The outputs with tag `AUDIO` and indices `0` and `1` are +connected to the streams named `audio_left` and `audio_right`. +`SomeAudioCalculator` identifies its audio inputs by index only (no tag needed). + +```proto +# Graph describing calculator SomeAudioVideoCalculator +node { + calculator: "SomeAudioVideoCalculator" + input_stream: "combined_input" + output_stream: "VIDEO:video_stream" + output_stream: "AUDIO:0:audio_left" + output_stream: "AUDIO:1:audio_right" +} + +node { + calculator: "SomeAudioCalculator" + input_stream: "audio_left" + input_stream: "audio_right" + output_stream: "audio_energy" +} +``` + +In the calculator implementation, inputs and outputs are also identified by tag +name and index number. In the function below input and output are identified: + +* By index number: The combined input stream is identified simply by index + `0`. +* By tag name: The video output stream is identified by tag name "VIDEO". +* By tag name and index number: The output audio streams are identified by the + combination of the tag name `AUDIO` and the index numbers `0` and `1`.
+ +```c++ +// c++ Code snippet describing the SomeAudioVideoCalculator GetContract() method +class SomeAudioVideoCalculator : public CalculatorBase { + public: + static absl::Status GetContract(CalculatorContract* cc) { + cc->Inputs().Index(0).SetAny(); + // SetAny() is used to specify that whatever the type of the + // stream is, it's acceptable. This does not mean that any + // packet is acceptable. Packets in the stream still have a + // particular type. SetAny() has the same effect as explicitly + // setting the type to be the stream's type. + cc->Outputs().Tag("VIDEO").Set(); + cc->Outputs().Get("AUDIO", 0).Set(); + cc->Outputs().Get("AUDIO", 1).Set(); + return absl::OkStatus(); + } +``` + +## Processing + +`Process()` called on a non-source node must return `absl::OkStatus()` to +indicate that all went well, or any other status code to signal an error. + +If a non-source calculator returns `tool::StatusStop()`, then this signals the +graph is being cancelled early. In this case, all source calculators and graph +input streams will be closed (and remaining Packets will propagate through the +graph). + +A source node in a graph will continue to have `Process()` called on it as long +as it returns `absl::OkStatus()`. To indicate that there is no more data to be +generated, return `tool::StatusStop()`. Any other status indicates an error has +occurred. + +`Close()` returns `absl::OkStatus()` to indicate success. Any other status +indicates a failure. + +Here is the basic `Process()` function. It uses the `Input()` method (which can +be used only if the calculator has a single input) to request its input data. It +then uses `std::unique_ptr` to allocate the memory needed for the output packet, +and does the calculations. When done it releases the pointer when adding it to +the output stream.
+ +```c++ +absl::Status MyCalculator::Process() { + const Matrix& input = Input()->Get(); + std::unique_ptr output(new Matrix(input.rows(), input.cols())); + // do your magic here.... + // output->row(n) = ... + Output()->Add(output.release(), InputTimestamp()); + return absl::OkStatus(); +} +``` + +## Calculator options + +Calculators accept processing parameters through (1) input stream packets, (2) +input side packets, and (3) calculator options. Calculator options, if +specified, appear as literal values in the `node_options` field of the +`CalculatorGraphConfig.Node` message. + +``` + node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS:main_model_input" + output_stream: "TENSORS:main_model_output" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "mediapipe/models/detection_model.tflite" + } + } + } +``` + +The `node_options` field accepts the proto3 syntax. Alternatively, calculator +options can be specified in the `options` field using proto2 syntax. + +``` + node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS:main_model_input" + output_stream: "TENSORS:main_model_output" + options: { + [mediapipe.TfLiteInferenceCalculatorOptions.ext] { + model_path: "mediapipe/models/detection_model.tflite" + } + } + } +``` + +Not all calculators accept calculator options. In order to accept options, a +calculator will normally define a new protobuf message type to represent its +options, such as `PacketClonerCalculatorOptions`. The calculator will then +read that protobuf message in its `CalculatorBase::Open` method, and possibly +also in its `CalculatorBase::GetContract` function or its +`CalculatorBase::Process` method. Normally, the new protobuf message type will +be defined as a protobuf schema using a ".proto" file and a +`mediapipe_proto_library()` build rule.
+ +``` + mediapipe_proto_library( + name = "packet_cloner_calculator_proto", + srcs = ["packet_cloner_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], + ) +``` + + +## Example calculator + +This section discusses the implementation of `PacketClonerCalculator`, which +does a relatively simple job, and is used in many calculator graphs. +`PacketClonerCalculator` simply produces a copy of its most recent input packets +on demand. + +`PacketClonerCalculator` is useful when the timestamps of arriving data packets +are not aligned perfectly. Suppose we have a room with a microphone, light +sensor and a video camera that is collecting sensory data. Each of the sensors +operates independently and collects data intermittently. Suppose that the output +of each sensor is: + +* microphone = loudness in decibels of sound in the room (Integer) +* light sensor = brightness of room (Integer) +* video camera = RGB image frame of room (ImageFrame) + +Our simple perception pipeline is designed to process sensory data from these 3 +sensors such that, at any time we have image frame data from the camera, it is +synchronized with the last collected microphone loudness data and light sensor +brightness data. To do this with MediaPipe, our perception pipeline has 3 +input streams: + +* room_mic_signal - Each packet of data in this input stream is integer data + representing how loud audio is in a room with timestamp. +* room_lighting_sensor - Each packet of data in this input stream is integer + data representing how brightly the room is illuminated with timestamp. +* room_video_tick_signal - Each packet of data in this input stream is + imageframe of video data representing video collected from camera in the + room with timestamp. + +Below is the implementation of the `PacketClonerCalculator`.
You can see the +`GetContract()`, `Open()`, and `Process()` methods as well as the instance +variable `current_` which holds the most recent input packets. + +```c++ +// This takes packets from N+1 streams, A_1, A_2, ..., A_N, B. +// For every packet that appears in B, outputs the most recent packet from each +// of the A_i on a separate stream. + +#include + +#include "absl/strings/str_cat.h" +#include "mediapipe/framework/calculator_framework.h" + +namespace mediapipe { + +// For every packet received on the last stream, output the latest packet +// obtained on all other streams. Therefore, if the last stream outputs at a +// higher rate than the others, this effectively clones the packets from the +// other streams to match the last. +// +// Example config: +// node { +// calculator: "PacketClonerCalculator" +// input_stream: "first_base_signal" +// input_stream: "second_base_signal" +// input_stream: "tick_signal" +// output_stream: "cloned_first_base_signal" +// output_stream: "cloned_second_base_signal" +// } +// +class PacketClonerCalculator : public CalculatorBase { + public: + static absl::Status GetContract(CalculatorContract* cc) { + const int tick_signal_index = cc->Inputs().NumEntries() - 1; + // cc->Inputs().NumEntries() returns the number of input streams + // for the PacketClonerCalculator + for (int i = 0; i < tick_signal_index; ++i) { + cc->Inputs().Index(i).SetAny(); + // cc->Inputs().Index(i) returns the input stream pointer by index + cc->Outputs().Index(i).SetSameAs(&cc->Inputs().Index(i)); + } + cc->Inputs().Index(tick_signal_index).SetAny(); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) final { + tick_signal_index_ = cc->Inputs().NumEntries() - 1; + current_.resize(tick_signal_index_); + // Pass along the header for each stream if present. 
+ for (int i = 0; i < tick_signal_index_; ++i) { + if (!cc->Inputs().Index(i).Header().IsEmpty()) { + cc->Outputs().Index(i).SetHeader(cc->Inputs().Index(i).Header()); + // Sets the output stream of index i header to be the same as + // the header for the input stream of index i + } + } + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) final { + // Store input signals. + for (int i = 0; i < tick_signal_index_; ++i) { + if (!cc->Inputs().Index(i).Value().IsEmpty()) { + current_[i] = cc->Inputs().Index(i).Value(); + } + } + + // Output if the tick signal is non-empty. + if (!cc->Inputs().Index(tick_signal_index_).Value().IsEmpty()) { + for (int i = 0; i < tick_signal_index_; ++i) { + if (!current_[i].IsEmpty()) { + cc->Outputs().Index(i).AddPacket( + current_[i].At(cc->InputTimestamp())); + // Add a packet to output stream of index i a packet from inputstream i + // with timestamp common to all present inputs + } else { + cc->Outputs().Index(i).SetNextTimestampBound( + cc->InputTimestamp().NextAllowedInStream()); + // if current_[i], 1 packet buffer for input stream i is empty, we will set + // next allowed timestamp for input stream i to be current timestamp + 1 + } + } + } + return absl::OkStatus(); + } + + private: + std::vector current_; + int tick_signal_index_; +}; + +REGISTER_CALCULATOR(PacketClonerCalculator); +} // namespace mediapipe +``` + +Typically, a calculator has only a .cc file. No .h is required, because +mediapipe uses registration to make calculators known to it. After you have +defined your calculator class, register it with a macro invocation +REGISTER_CALCULATOR(calculator_class_name). + +Below is a trivial MediaPipe graph that has 3 input streams, 1 node +(PacketClonerCalculator) and 2 output streams. 
+ +```proto +input_stream: "room_mic_signal" +input_stream: "room_lighting_sensor" +input_stream: "room_video_tick_signal" + +node { + calculator: "PacketClonerCalculator" + input_stream: "room_mic_signal" + input_stream: "room_lighting_sensor" + input_stream: "room_video_tick_signal" + output_stream: "cloned_room_mic_signal" + output_stream: "cloned_lighting_sensor" + } +``` + +The diagram below shows how the `PacketClonerCalculator` defines its output +packets (bottom) based on its series of input packets (top). + +![Graph using PacketClonerCalculator](https://mediapipe.dev/images/packet_cloner_calculator.png) | +:--------------------------------------------------------------------------: | +*Each time it receives a packet on its TICK input stream, the PacketClonerCalculator outputs the most recent packet from each of its input streams. The sequence of output packets (bottom) is determined by the sequence of input packets (top) and their timestamps. The timestamps are shown along the right side of the diagram.* | diff --git a/docs/framework_concepts/framework_concepts.md b/docs/framework_concepts/framework_concepts.md new file mode 100644 index 0000000..004c75c --- /dev/null +++ b/docs/framework_concepts/framework_concepts.md @@ -0,0 +1,128 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/framework_concepts/overview +title: Framework Concepts +nav_order: 5 +has_children: true +has_toc: false +--- + +# Framework Concepts +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## The basics + +### Packet + +The basic data flow unit. A packet consists of a numeric timestamp and a shared +pointer to an **immutable** payload. 
The payload can be of any C++ type, and the +payload's type is also referred to as the type of the packet. Packets are value +classes and can be copied cheaply. Each copy shares ownership of the payload, +with reference-counting semantics. Each copy has its own timestamp. See also +[Packet](packets.md). + +### Graph + +MediaPipe processing takes place inside a graph, which defines packet flow paths +between **nodes**. A graph can have any number of inputs and outputs, and data +flow can branch and merge. Generally data flows forward, but backward loops are +possible. See [Graphs](graphs.md) for details. + +### Nodes + +Nodes produce and/or consume packets, and they are where the bulk of the graph’s +work takes place. They are also known as “calculators”, for historical reasons. +Each node’s interface defines a number of input and output **ports**, identified +by a tag and/or an index. See [Calculators](calculators.md) for details. + +### Streams + +A stream is a connection between two nodes that carries a sequence of packets, +whose timestamps must be monotonically increasing. + +### Side packets + +A side packet connection between nodes carries a single packet (with unspecified +timestamp). It can be used to provide some data that will remain constant, +whereas a stream represents a flow of data that changes over time. + +### Packet Ports + +A port has an associated type; packets transiting through the port must be of +that type. An output stream port can be connected to any number of input stream +ports of the same type; each consumer receives a separate copy of the output +packets, and has its own queue, so it can consume them at its own pace. +Similarly, a side packet output port can be connected to as many side packet +input ports as desired. + +A port can be required, meaning that a connection must be made for the graph to +be valid, or optional, meaning it may remain unconnected. 
+ +Note: even if a stream connection is required, the stream may not carry a packet +for all timestamps. + +## Input and output + +Data flow can originate from **source nodes**, which have no input streams and +produce packets spontaneously (e.g. by reading from a file); or from **graph +input streams**, which let an application feed packets into a graph. + +Similarly, there are **sink nodes** that receive data and write it to various +destinations (e.g. a file, a memory buffer, etc.), and an application can also +receive output from the graph using **callbacks**. + +## Runtime behavior + +### Graph lifetime + +Once a graph has been initialized, it can be **started** to begin processing +data, and can process a stream of packets until each stream is closed or the +graph is **canceled**. Then the graph can be destroyed or **started** again. + +### Node lifetime + +There are three main lifetime methods the framework will call on a node: + +- Open: called once, before the other methods. When it is called, all input + side packets required by the node will be available. +- Process: called multiple times, when a new set of inputs is available, + according to the node’s input policy. +- Close: called once, at the end. + +In addition, each calculator can define constructor and destructor, which are +useful for creating and deallocating resources that are independent of the +processed data. + +### Input policies + +The default input policy is deterministic collation of packets by timestamp. A +node receives all inputs for the same timestamp at the same time, in an +invocation of its Process method; and successive input sets are received in +their timestamp order. This can require delaying the processing of some packets +until a packet with the same timestamp is received on all input streams, or +until it can be guaranteed that a packet with that timestamp will not be +arriving on the streams that have not received it. 
+ +Other policies are also available, implemented using a separate kind of +component known as an InputStreamHandler. + +See [Synchronization](synchronization.md) for more details. + +### Real-time streams + +MediaPipe calculator graphs are often used to process streams of video or audio +frames for interactive applications. Normally, each Calculator runs as soon as +all of its input packets for a given timestamp become available. Calculators +used in real-time graphs need to define output timestamp bounds based on input +timestamp bounds in order to allow downstream calculators to be scheduled +promptly. See [Real-time Streams](realtime_streams.md) for details. diff --git a/docs/framework_concepts/gpu.md b/docs/framework_concepts/gpu.md new file mode 100644 index 0000000..8900ab3 --- /dev/null +++ b/docs/framework_concepts/gpu.md @@ -0,0 +1,170 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/framework_concepts/gpu +title: GPU +parent: Framework Concepts +nav_order: 5 +--- + +# GPU +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Overview + +MediaPipe supports calculator nodes for GPU compute and rendering, and allows combining multiple GPU nodes, as well as mixing them with CPU based calculator nodes. There exist several GPU APIs on mobile platforms (eg, OpenGL ES, Metal and Vulkan). MediaPipe does not attempt to offer a single cross-API GPU abstraction. Individual nodes can be written using different APIs, allowing them to take advantage of platform specific features when needed. + +GPU support is essential for good performance on mobile platforms, especially for real-time video. 
MediaPipe enables developers to write GPU-compatible calculators that support the use of GPU for: + + * On-device real-time processing, not just batch processing + * Video rendering and effects, not just analysis + +Below are the design principles for GPU support in MediaPipe: + + * GPU-based calculators should be able to occur anywhere in the graph, and not necessarily be used for on-screen rendering. + * Transfer of frame data from one GPU-based calculator to another should be fast, and not incur expensive copy operations. + * Transfer of frame data between CPU and GPU should be as efficient as the platform allows. + * Because different platforms may require different techniques for best performance, the API should allow flexibility in the way things are implemented behind the scenes. + * A calculator should be allowed maximum flexibility in using the GPU for all or part of its operation, combining it with the CPU if necessary. + +## OpenGL ES Support + +MediaPipe supports OpenGL ES up to version 3.2 on Android/Linux and up to ES 3.0 +on iOS. In addition, MediaPipe also supports Metal on iOS. + +OpenGL ES 3.1 or greater is required (on Android/Linux systems) for running +machine learning inference calculators and graphs. + +MediaPipe allows graphs to run OpenGL in multiple GL contexts. For example, this +can be very useful in graphs that combine a slower GPU inference path (e.g., at 10 +FPS) with a faster GPU rendering path (e.g., at 30 FPS): since one GL context +corresponds to one sequential command queue, using the same context for both +tasks would reduce the rendering frame rate. + +One challenge MediaPipe's use of multiple contexts solves is the ability to +communicate across them. An example scenario is one with an input video that is +sent to both the rendering and inference paths, and rendering needs to have +access to the latest output from inference. + +An OpenGL context cannot be accessed by multiple threads at the same time.
+Furthermore, switching the active GL context on the same thread can be slow on +some Android devices. Therefore, our approach is to have one dedicated thread +per context. Each thread issues GL commands, building up a serial command queue +on its context, which is then executed by the GPU asynchronously. + +## Life of a GPU Calculator + +This section presents the basic structure of the Process method of a GPU +calculator derived from base class GlSimpleCalculator. The GPU calculator +`LuminanceCalculator` is shown as an example. The method +`LuminanceCalculator::GlRender` is called from `GlSimpleCalculator::Process`. + +```c++ +// Converts RGB images into luminance images, still stored in RGB format. +// See GlSimpleCalculator for inputs, outputs and input side packets. +class LuminanceCalculator : public GlSimpleCalculator { + public: + absl::Status GlSetup() override; + absl::Status GlRender(const GlTexture& src, + const GlTexture& dst) override; + absl::Status GlTeardown() override; + + private: + GLuint program_ = 0; + GLint frame_; +}; +REGISTER_CALCULATOR(LuminanceCalculator); + +absl::Status LuminanceCalculator::GlRender(const GlTexture& src, + const GlTexture& dst) { + static const GLfloat square_vertices[] = { + -1.0f, -1.0f, // bottom left + 1.0f, -1.0f, // bottom right + -1.0f, 1.0f, // top left + 1.0f, 1.0f, // top right + }; + static const GLfloat texture_vertices[] = { + 0.0f, 0.0f, // bottom left + 1.0f, 0.0f, // bottom right + 0.0f, 1.0f, // top left + 1.0f, 1.0f, // top right + }; + + // program + glUseProgram(program_); + glUniform1i(frame_, 1); + + // vertex storage + GLuint vbo[2]; + glGenBuffers(2, vbo); + GLuint vao; + glGenVertexArrays(1, &vao); + glBindVertexArray(vao); + + // vbo 0 + glBindBuffer(GL_ARRAY_BUFFER, vbo[0]); + glBufferData(GL_ARRAY_BUFFER, 4 * 2 * sizeof(GLfloat), square_vertices, + GL_STATIC_DRAW); + glEnableVertexAttribArray(ATTRIB_VERTEX); + glVertexAttribPointer(ATTRIB_VERTEX, 2, GL_FLOAT, 0, 0, nullptr); + + // vbo 1 + 
glBindBuffer(GL_ARRAY_BUFFER, vbo[1]); + glBufferData(GL_ARRAY_BUFFER, 4 * 2 * sizeof(GLfloat), texture_vertices, + GL_STATIC_DRAW); + glEnableVertexAttribArray(ATTRIB_TEXTURE_POSITION); + glVertexAttribPointer(ATTRIB_TEXTURE_POSITION, 2, GL_FLOAT, 0, 0, nullptr); + + // draw + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); + + // cleanup + glDisableVertexAttribArray(ATTRIB_VERTEX); + glDisableVertexAttribArray(ATTRIB_TEXTURE_POSITION); + glBindBuffer(GL_ARRAY_BUFFER, 0); + glBindVertexArray(0); + glDeleteVertexArrays(1, &vao); + glDeleteBuffers(2, vbo); + + return absl::OkStatus(); +} +``` + +The design principles mentioned above have resulted in the following design +choices for MediaPipe GPU support: + + * We have a GPU data type, called `GpuBuffer`, for representing image data, optimized for GPU usage. The exact contents of this data type are opaque and platform-specific. + * A low-level API based on composition, where any calculator that wants to make use of the GPU creates and owns an instance of the `GlCalculatorHelper` class. This class offers a platform-agnostic API for managing the OpenGL context, setting up textures for inputs and outputs, etc. + * A high-level API based on subclassing, where simple calculators implementing image filters subclass from `GlSimpleCalculator` and only need to override a couple of virtual methods with their specific OpenGL code, while the superclass takes care of all the plumbing. + * Data that needs to be shared between all GPU-based calculators is provided as an external input that is implemented as a graph service and is managed by the `GlCalculatorHelper` class. + * The combination of calculator-specific helpers and a shared graph service allows us great flexibility in managing the GPU resource: we can have a separate context per calculator, share a single context, share a lock or other synchronization primitives, etc. -- and all of this is managed by the helper and hidden from the individual calculators.
+ +## GpuBuffer to ImageFrame Converters + +We provide two calculators called `GpuBufferToImageFrameCalculator` and `ImageFrameToGpuBufferCalculator`. These calculators convert between `ImageFrame` and `GpuBuffer`, allowing the construction of graphs that combine GPU and CPU calculators. They are supported on both iOS and Android. + +When possible, these calculators use platform-specific functionality to share data between the CPU and the GPU without copying. + +The diagram below shows the data flow in a mobile application that captures video from the camera, runs it through a MediaPipe graph, and renders the output on the screen in real time. The dashed line indicates which parts are inside the MediaPipe graph proper. This application runs a Canny edge-detection filter on the CPU using OpenCV, and overlays it on top of the original video using the GPU. + +![How GPU calculators interact](https://mediapipe.dev/images/gpu_example_graph.png) + +Video frames from the camera are fed into the graph as `GpuBuffer` packets. The +input stream is accessed by two calculators in parallel. +`GpuBufferToImageFrameCalculator` converts the buffer into an `ImageFrame`, +which is then sent through a grayscale converter and a Canny filter (both based +on OpenCV and running on the CPU), whose output is then converted into a +`GpuBuffer` again. A multi-input GPU calculator, `GlOverlayCalculator`, takes as +input both the original `GpuBuffer` and the one coming out of the edge detector, +and overlays them using a shader. The output is then sent back to the +application using a callback calculator, and the application renders the image +to the screen using OpenGL.
diff --git a/docs/framework_concepts/graphs.md b/docs/framework_concepts/graphs.md new file mode 100644 index 0000000..5f9c68e --- /dev/null +++ b/docs/framework_concepts/graphs.md @@ -0,0 +1,399 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/framework_concepts/graphs +title: Graphs +parent: Framework Concepts +nav_order: 2 +--- + +# Graphs +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Graph + +A `CalculatorGraphConfig` proto specifies the topology and functionality of a +MediaPipe graph. Each `node` in the graph represents a particular calculator or +subgraph, and specifies necessary configurations, such as registered +calculator/subgraph type, inputs, outputs and optional fields, such as +node-specific options, input policy and executor, discussed in +[Synchronization](synchronization.md). + +`CalculatorGraphConfig` has several other fields to configure global graph-level +settings, e.g. graph executor configs, number of threads, and maximum queue size +of input streams. Several graph-level settings are useful for tuning the +performance of the graph on different platforms (e.g., desktop vs. mobile). For +instance, on mobile, attaching a heavy model-inference calculator to a separate +executor can improve the performance of a real-time application since this +enables thread locality. + +Below is a trivial `CalculatorGraphConfig` example where we have a series of +passthrough calculators: + +```proto +# This graph named main_pass_throughcals_nosubgraph.pbtxt contains 4 +# passthrough calculators.
+input_stream: "in" +output_stream: "out" +node { + calculator: "PassThroughCalculator" + input_stream: "in" + output_stream: "out1" +} +node { + calculator: "PassThroughCalculator" + input_stream: "out1" + output_stream: "out2" +} +node { + calculator: "PassThroughCalculator" + input_stream: "out2" + output_stream: "out3" +} +node { + calculator: "PassThroughCalculator" + input_stream: "out3" + output_stream: "out" +} +``` + +MediaPipe offers an alternative `C++` representation for complex graphs (e.g. ML pipelines, handling model metadata, optional nodes, etc.). The above graph may look like: + +```c++ +CalculatorGraphConfig BuildGraphConfig() { + Graph graph; + + // Graph inputs + Stream in = graph.In(0).SetName("in"); + + auto pass_through_fn = [](Stream in, + Graph& graph) -> Stream { + auto& node = graph.AddNode("PassThroughCalculator"); + in.ConnectTo(node.In(0)); + return node.Out(0); + }; + + Stream out1 = pass_through_fn(in, graph); + Stream out2 = pass_through_fn(out1, graph); + Stream out3 = pass_through_fn(out2, graph); + Stream out4 = pass_through_fn(out3, graph); + + // Graph outputs + out4.SetName("out").ConnectTo(graph.Out(0)); + + return graph.GetConfig(); +} +``` +See more details in [Building Graphs in C++](building_graphs_cpp.md) + +## Subgraph + +To modularize a `CalculatorGraphConfig` into sub-modules and assist with re-use +of perception solutions, a MediaPipe graph can be defined as a `Subgraph`. The +public interface of a subgraph consists of a set of input and output streams +similar to a calculator's public interface. The subgraph can then be included in +a `CalculatorGraphConfig` as if it were a calculator. When a MediaPipe graph is +loaded from a `CalculatorGraphConfig`, each subgraph node is replaced by the +corresponding graph of calculators. As a result, the semantics and performance +of the subgraph is identical to the corresponding graph of calculators. 
+ +Below is an example of how to create a subgraph named `TwoPassThroughSubgraph`. + +1. Define the subgraph. + + ```proto + # This subgraph is defined in twopassthrough_subgraph.pbtxt + # and is registered as "TwoPassThroughSubgraph" + + type: "TwoPassThroughSubgraph" + input_stream: "out1" + output_stream: "out3" + + node { + calculator: "PassThroughCalculator" + input_stream: "out1" + output_stream: "out2" + } + node { + calculator: "PassThroughCalculator" + input_stream: "out2" + output_stream: "out3" + } + ``` + + The public interface to the subgraph consists of: + + * Graph input streams + * Graph output streams + * Graph input side packets + * Graph output side packets + +2. Register the subgraph using BUILD rule `mediapipe_simple_subgraph`. The + parameter `register_as` defines the component name for the new subgraph. + + ```proto + # Small section of BUILD file for registering the "TwoPassThroughSubgraph" + # subgraph for use by main graph main_pass_throughcals.pbtxt + + mediapipe_simple_subgraph( + name = "twopassthrough_subgraph", + graph = "twopassthrough_subgraph.pbtxt", + register_as = "TwoPassThroughSubgraph", + deps = [ + "//mediapipe/calculators/core:pass_through_calculator", + "//mediapipe/framework:calculator_graph", + ], + ) + ``` + +3. Use the subgraph in the main graph. + + ```proto + # This main graph is defined in main_pass_throughcals.pbtxt + # using subgraph called "TwoPassThroughSubgraph" + + input_stream: "in" + node { + calculator: "PassThroughCalculator" + input_stream: "in" + output_stream: "out1" + } + node { + calculator: "TwoPassThroughSubgraph" + input_stream: "out1" + output_stream: "out3" + } + node { + calculator: "PassThroughCalculator" + input_stream: "out3" + output_stream: "out4" + } + ``` + +## Graph Options + +It is possible to specify a "graph options" protobuf for a MediaPipe graph +similar to the [`Calculator Options`](calculators.md#calculator-options) +protobuf specified for a MediaPipe calculator.
These "graph options" can be +specified where a graph is invoked, and used to populate calculator options and +subgraph options within the graph. + +In a `CalculatorGraphConfig`, graph options can be specified for a subgraph +exactly like calculator options, as shown below: + +``` +node { + calculator: "FlowLimiterCalculator" + input_stream: "image" + output_stream: "throttled_image" + node_options: { + [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] { + max_in_flight: 1 + } + } +} + +node { + calculator: "FaceDetectionSubgraph" + input_stream: "IMAGE:throttled_image" + node_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] { + tensor_width: 192 + tensor_height: 192 + } + } +} +``` + +In a `CalculatorGraphConfig`, graph options can be accepted and used to populate +calculator options, as shown below: + +``` +graph_options: { + [type.googleapis.com/mediapipe.FaceDetectionOptions] {} +} + +node: { + calculator: "ImageToTensorCalculator" + input_stream: "IMAGE:image" + node_options: { + [type.googleapis.com/mediapipe.ImageToTensorCalculatorOptions] { + keep_aspect_ratio: true + border_mode: BORDER_ZERO + } + } + option_value: "output_tensor_width:options/tensor_width" + option_value: "output_tensor_height:options/tensor_height" +} + +node { + calculator: "InferenceCalculator" + node_options: { + [type.googleapis.com/mediapipe.InferenceCalculatorOptions] {} + } + option_value: "delegate:options/delegate" + option_value: "model_path:options/model_path" +} +``` + +In this example, the `FaceDetectionSubgraph` accepts graph option protobuf +`FaceDetectionOptions`. The `FaceDetectionOptions` is used to define some field +values in the calculator options `ImageToTensorCalculatorOptions` and some field +values in the subgraph options `InferenceCalculatorOptions`. The field values +are defined using the `option_value:` syntax. 
+ +In the `CalculatorGraphConfig::Node` protobuf, the fields `node_options:` and +`option_value:` together define the option values for a calculator such as +`ImageToTensorCalculator`. The `node_options:` field defines a set of literal +constant values using the text protobuf syntax. Each `option_value:` field +defines the value for one protobuf field using information from the enclosing +graph, specifically from field values of the graph options of the enclosing +graph. In the example above, the `option_value:` +`"output_tensor_width:options/tensor_width"` defines the field +`ImageToTensorCalculatorOptions.output_tensor_width` using the value of +`FaceDetectionOptions.tensor_width`. + +The syntax of `option_value:` is similar to the syntax of `input_stream:`. The +syntax is `option_value: "LHS:RHS"`. The LHS identifies a calculator option +field and the RHS identifies a graph option field. More specifically, the LHS +and RHS each consists of a series of protobuf field names identifying nested +protobuf messages and fields separated by '/'. This is known as the "ProtoPath" +syntax. Nested messages that are referenced in the LHS or RHS must already be +defined in the enclosing protobuf in order to be traversed using +`option_value:`. + +## Cycles + + + +By default, MediaPipe requires calculator graphs to be acyclic and treats cycles +in a graph as errors. If a graph is intended to have cycles, the cycles need to +be annotated in the graph config. This page describes how to do that. + +NOTE: The current approach is experimental and subject to change. We welcome +your feedback. + +Please use the `CalculatorGraphTest.Cycle` unit test in +`mediapipe/framework/calculator_graph_test.cc` as sample code. Shown below is +the cyclic graph in the test. The `sum` output of the adder is the sum of the +integers generated by the integer source calculator. 
+ +![a cyclic graph that adds a stream of integers](https://mediapipe.dev/images/cyclic_integer_sum_graph.svg "A cyclic graph") + +This simple graph illustrates all the issues in supporting cyclic graphs. + +### Back Edge Annotation + +We require that an edge in each cycle be annotated as a back edge. This allows +MediaPipe’s topological sort to work, after removing all the back edges. + +There are usually multiple ways to select the back edges. Which edges are marked +as back edges affects which nodes are considered as upstream and which nodes are +considered as downstream, which in turn affects the priorities MediaPipe assigns +to the nodes. + +For example, the `CalculatorGraphTest.Cycle` test marks the `old_sum` edge as a +back edge, so the Delay node is considered as a downstream node of the adder +node and is given a higher priority. Alternatively, we could mark the `sum` +input to the delay node as the back edge, in which case the delay node would be +considered as an upstream node of the adder node and is given a lower priority. + +### Initial Packet + +For the adder calculator to be runnable when the first integer from the integer +source arrives, we need an initial packet, with value 0 and with the same +timestamp, on the `old_sum` input stream to the adder. This initial packet +should be output by the delay calculator in the `Open()` method. + +### Delay in a Loop + +Each loop should incur a delay to align the previous `sum` output with the next +integer input. This is also done by the delay node. So the delay node needs to +know the following about the timestamps of the integer source calculator: + +* The timestamp of the first output. + +* The timestamp delta between successive outputs. + +We plan to add an alternative scheduling policy that only cares about packet +ordering and ignores packet timestamps, which will eliminate this inconvenience. 
+ +### Early Termination of a Calculator When One Input Stream is Done + +By default, MediaPipe calls the `Close()` method of a non-source calculator when +all of its input streams are done. In the example graph, we want to stop the +adder node as soon as the integer source is done. This is accomplished by +configuring the adder node with an alternative input stream handler, +`EarlyCloseInputStreamHandler`. + +### Relevant Source Code + +#### Delay Calculator + +Note the code in `Open()` that outputs the initial packet and the code in +`Process()` that adds a (unit) delay to input packets. As noted above, this +delay node assumes that its output stream is used alongside an input stream with +packet timestamps 0, 1, 2, 3, ... + +```c++ +class UnitDelayCalculator : public Calculator { + public: +  static absl::Status FillExpectations( +      const CalculatorOptions& extendable_options, PacketTypeSet* inputs, +      PacketTypeSet* outputs, PacketTypeSet* input_side_packets) { +    inputs->Index(0)->Set("An integer."); +    outputs->Index(0)->Set("The input delayed by one time unit."); +    return absl::OkStatus(); +  } + +  absl::Status Open() final { +    Output()->Add(new int(0), Timestamp(0)); +    return absl::OkStatus(); +  } + +  absl::Status Process() final { +    const Packet& packet = Input()->Value(); +    Output()->AddPacket(packet.At(packet.Timestamp().NextAllowedInStream())); +    return absl::OkStatus(); +  } +}; +``` + +#### Graph Config + +Note the `back_edge` annotation and the alternative `input_stream_handler`. 
+ +```proto +node { +  calculator: 'GlobalCountSourceCalculator' +  input_side_packet: 'global_counter' +  output_stream: 'integers' +} +node { +  calculator: 'IntAdderCalculator' +  input_stream: 'integers' +  input_stream: 'old_sum' +  input_stream_info: { +    tag_index: ':1' # 'old_sum' +    back_edge: true +  } +  output_stream: 'sum' +  input_stream_handler { +    input_stream_handler: 'EarlyCloseInputStreamHandler' +  } +} +node { +  calculator: 'UnitDelayCalculator' +  input_stream: 'sum' +  output_stream: 'old_sum' +} +``` diff --git a/docs/framework_concepts/packets.md b/docs/framework_concepts/packets.md new file mode 100644 index 0000000..1bfad37 --- /dev/null +++ b/docs/framework_concepts/packets.md @@ -0,0 +1,48 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/framework_concepts/packets +title: Packets +parent: Framework Concepts +nav_order: 3 +--- + +# Packets +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +Calculators communicate by sending and receiving packets. Typically a single +packet is sent along each input stream at each input timestamp. A packet can +contain any kind of data, such as a single frame of video or a single integer +detection count. + +## Creating a packet + +Packets are generally created with `mediapipe::MakePacket()` or +`mediapipe::Adopt()` (from packet.h). + +```c++ +// Create a packet containing some new data. +Packet p = MakePacket("constructor_argument"); +// Make a new packet with the same data and a different timestamp. +Packet p2 = p.At(Timestamp::PostStream()); +``` + +or: + +```c++ +// Create some new data. +auto data = absl::make_unique("constructor_argument"); +// Create a packet to own the data. 
+Packet p = Adopt(data.release()).At(Timestamp::PostStream()); +``` + +Data within a packet is accessed with `Packet::Get()` diff --git a/docs/framework_concepts/realtime_streams.md b/docs/framework_concepts/realtime_streams.md new file mode 100644 index 0000000..60f586c --- /dev/null +++ b/docs/framework_concepts/realtime_streams.md @@ -0,0 +1,193 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/framework_concepts/realtime_streams +title: Real-time Streams +parent: Framework Concepts +nav_order: 6 +--- + +# Real-time Streams +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Real-time timestamps + +MediaPipe calculator graphs are often used to process streams of video or audio +frames for interactive applications. The MediaPipe framework requires only that +successive packets be assigned monotonically increasing timestamps. By +convention, real-time calculators and graphs use the recording time or the +presentation time of each frame as its timestamp, with each timestamp indicating +the microseconds since `Jan/1/1970:00:00:00`. This allows packets from various +sources to be processed in a globally consistent sequence. + +## Real-time scheduling + +Normally, each Calculator runs as soon as all of its input packets for a given +timestamp become available. Normally, this happens when the calculator has +finished processing the previous frame, and each of the calculators producing +its inputs have finished processing the current frame. The MediaPipe scheduler +invokes each calculator as soon as these conditions are met. See +[Synchronization](synchronization.md) for more details. 
+ +## Timestamp bounds + +When a calculator does not produce any output packets for a given timestamp, it +can instead output a "timestamp bound" indicating that no packet will be +produced for that timestamp. This indication is necessary to allow downstream +calculators to run at that timestamp, even though no packet has arrived for +certain streams for that timestamp. This is especially important for real-time +graphs in interactive applications, where it is crucial that each calculator +begin processing as soon as possible. + +Consider a graph like the following: + +``` +node { + calculator: "A" + input_stream: "alpha_in" + output_stream: "alpha" +} +node { + calculator: "B" + input_stream: "alpha" + input_stream: "foo" + output_stream: "beta" +} +``` + +Suppose: at timestamp `T`, node `A` doesn't send a packet in its output stream +`alpha`. Node `B` gets a packet in `foo` at timestamp `T` and is waiting for a +packet in `alpha` at timestamp `T`. If `A` doesn't send `B` a timestamp bound +update for `alpha`, `B` will keep waiting for a packet to arrive in `alpha`. +Meanwhile, the packet queue of `foo` will accumulate packets at `T`, `T+1` and +so on. + +To output a packet on a stream, a calculator uses the API functions +`CalculatorContext::Outputs` and `OutputStream::Add`. To instead output a +timestamp bound on a stream, a calculator can use the API functions +`CalculatorContext::Outputs` and `CalculatorContext::SetNextTimestampBound`. The +specified bound is the lowest allowable timestamp for the next packet on the +specified output stream. When no packet is output, a calculator will typically +do something like: + +``` +cc->Outputs().Tag("output_frame").SetNextTimestampBound( + cc->InputTimestamp().NextAllowedInStream()); +``` + +The function `Timestamp::NextAllowedInStream` returns the successive timestamp. +For example, `Timestamp(1).NextAllowedInStream() == Timestamp(2)`. 
+ +## Propagating timestamp bounds + +Calculators that will be used in real-time graphs need to define output +timestamp bounds based on input timestamp bounds in order to allow downstream +calculators to be scheduled promptly. A common pattern is for calculators to +output packets with the same timestamps as their input packets. In this case, +simply outputting a packet on every call to `Calculator::Process` is sufficient +to define output timestamp bounds. + +However, calculators are not required to follow this common pattern for output +timestamps; they are only required to choose monotonically increasing output +timestamps. As a result, certain calculators must calculate timestamp bounds +explicitly. MediaPipe provides several tools for computing appropriate timestamp +bounds for each calculator. + +1\. **SetNextTimestampBound()** can be used to specify the timestamp bound, `t + +1`, for an output stream. + +``` +cc->Outputs().Tag("OUT").SetNextTimestampBound(t.NextAllowedInStream()); +``` + +Alternatively, an empty packet with timestamp `t` can be produced to specify the +timestamp bound `t + 1`. + +``` +cc->Outputs().Tag("OUT").Add(Packet(), t); +``` + +The timestamp bound of an input stream is indicated by the packet or the empty +packet on the input stream. + +``` +Timestamp bound = cc->Inputs().Tag("IN").Value().Timestamp(); +``` + +2\. **TimestampOffset()** can be specified in order to automatically copy the +timestamp bound from input streams to output streams. + +``` +cc->SetTimestampOffset(0); +``` + +This setting has the advantage of propagating timestamp bounds automatically, +even when only timestamp bounds arrive and `Calculator::Process` is not invoked. + +3\. **ProcessTimestampBounds()** can be specified in order to invoke +`Calculator::Process` for each new "settled timestamp", where the "settled +timestamp" is the new highest timestamp below the current timestamp bounds.
+Without `ProcessTimestampBounds()`, `Calculator::Process` is invoked only with +one or more arriving packets. + +``` +cc->SetProcessTimestampBounds(true); +``` + +This setting allows a calculator to perform its own timestamp bounds calculation +and propagation, even when only input timestamps are updated. It can be used to +replicate the effect of `TimestampOffset()`, but it can also be used to +calculate a timestamp bound that takes into account additional factors. + +For example, in order to replicate `SetTimestampOffset(0)`, a calculator could +do the following: + +``` +absl::Status Open(CalculatorContext* cc) { + cc->SetProcessTimestampBounds(true); +} + +absl::Status Process(CalculatorContext* cc) { + cc->Outputs().Tag("OUT").SetNextTimestampBound( + cc->InputTimestamp().NextAllowedInStream()); +} +``` + +## Scheduling of Calculator::Open and Calculator::Close + +`Calculator::Open` is invoked when all required input side-packets have been +produced. Input side-packets can be provided by the enclosing application or by +"side-packet calculators" inside the graph. Side-packets can be specified from +outside the graph using the API's `CalculatorGraph::Initialize` and +`CalculatorGraph::StartRun`. Side packets can be specified by calculators within +the graph using `CalculatorGraphConfig::OutputSidePackets` and +`OutputSidePacket::Set`. + +`Calculator::Close` is invoked when all of the input streams have become `Done` by +being closed or reaching timestamp bound `Timestamp::Done`. + +**Note:** If the graph finishes all pending calculator execution and becomes +`Done`, before some streams become `Done`, then MediaPipe will invoke the +remaining calls to `Calculator::Close`, so that every calculator can produce its +final outputs. + +The use of `TimestampOffset` has some implications for `Calculator::Close`.
A +calculator specifying `SetTimestampOffset(0)` will by design signal that all of +its output streams have reached `Timestamp::Done` when all of its input streams +have reached `Timestamp::Done`, and therefore no further outputs are possible. +This prevents such a calculator from emitting any packets during +`Calculator::Close`. If a calculator needs to produce a summary packet during +`Calculator::Close`, `Calculator::Process` must specify timestamp bounds such +that at least one timestamp (such as `Timestamp::Max`) remains available during +`Calculator::Close`. This means that such a calculator normally cannot rely upon +`SetTimestampOffset(0)` and must instead specify timestamp bounds explicitly +using `SetNextTimestampBound()`. diff --git a/docs/framework_concepts/synchronization.md b/docs/framework_concepts/synchronization.md new file mode 100644 index 0000000..8a0a907 --- /dev/null +++ b/docs/framework_concepts/synchronization.md @@ -0,0 +1,182 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/framework_concepts/synchronization +title: Synchronization +parent: Framework Concepts +nav_order: 4 +--- + +# Synchronization +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Scheduling mechanics + +Data processing in a MediaPipe graph occurs inside processing nodes defined as +[`CalculatorBase`] subclasses. The scheduling system decides when each +calculator should run. + +Each graph has at least one **scheduler queue**. Each scheduler queue has +exactly one **executor**. Nodes are statically assigned to a queue (and +therefore to an executor). By default there is one queue, whose executor is a +thread pool with a number of threads based on the system’s capabilities.
+ +Each node has a scheduling state, which can be *not ready*, *ready*, or +*running*. A readiness function determines whether a node is ready to run. This +function is invoked at graph initialization, whenever a node finishes running, +and whenever the state of a node’s inputs changes. + +The readiness function used depends on the type of node. A node with no stream +inputs is known as a **source node**; source nodes are always ready to run, +until they tell the framework they have no more data to output, at which point +they are closed. + +Non-source nodes are ready if they have inputs to process, and if those inputs +form a valid input set according to the conditions set by the node’s **input +policy** (discussed below). Most nodes use the default input policy, but some +nodes specify a different one. + +Note: Because changing the input policy changes the guarantees the calculator’s +code can expect from its inputs, it is not generally possible to mix and match +calculators with arbitrary input policies. Thus a calculator that uses a special +input policy should be written for it, and declare it in its contract. + +When a node becomes ready, a task is added to the corresponding scheduler queue, +which is a priority queue. The priority function is currently fixed, and takes +into account static properties of the nodes and their topological sorting within +the graph. For example, nodes closer to the output side of the graph have higher +priority, while source nodes have the lowest priority. + +Each queue is served by an executor, which is responsible for actually running +the task by invoking the calculator’s code. Different executors can be provided +and configured; this can be used to customize the use of execution resources, +e.g. by running certain nodes on lower-priority threads. 
+ +## Timestamp Synchronization + +MediaPipe graph execution is decentralized: there is no global clock, and +different nodes can process data from different timestamps at the same time. +This allows higher throughput via pipelining. + +However, time information is very important for many perception workflows. Nodes +that receive multiple input streams generally need to coordinate them in some +way. For example, an object detector may output a list of boundary rectangles +from a frame, and this information may be fed into a rendering node, which +should process it together with the original frame. + +Therefore, one of the key responsibilities of the MediaPipe framework is to +provide input synchronization for nodes. In terms of framework mechanics, the +primary role of a timestamp is to serve as a **synchronization key**. + +Furthermore, MediaPipe is designed to support deterministic operations, which is +important in many scenarios (testing, simulation, batch processing, etc.), while +allowing graph authors to relax determinism where needed to meet real-time +constraints. + +The two objectives of synchronization and determinism underlie several design +choices. Notably, the packets pushed into a given stream must have monotonically +increasing timestamps: this is not just a useful assumption for many nodes, but +it is also relied upon by the synchronization logic. Each stream has a +**timestamp bound**, which is the lowest possible timestamp allowed for a new +packet on the stream. When a packet with timestamp `T` arrives, the bound +automatically advances to `T+1`, reflecting the monotonic requirement. This +allows the framework to know for certain that no more packets with timestamp +lower than `T` will arrive. + +## Input policies + +Synchronization is handled locally on each node, using the input policy +specified by the node. 
+ +The default input policy, defined by [`DefaultInputStreamHandler`], provides +deterministic synchronization of inputs, with the following guarantees: + +* If packets with the same timestamp are provided on multiple input streams, + they will always be processed together regardless of their arrival order in + real time. + +* Input sets are processed in strictly ascending timestamp order. + +* No packets are dropped, and the processing is fully deterministic. + +* The node becomes ready to process data as soon as possible given the + guarantees above. + +Note: An important consequence of this is that if the calculator always uses the +current input timestamp when outputting packets, the output will inherently obey +the monotonically increasing timestamp requirement. + +Warning: On the other hand, it is not guaranteed that an input packet will +always be available for all streams. + +To explain how it works, we need to introduce the definition of a settled +timestamp. We say that a timestamp in a stream is *settled* if it is lower than +the timestamp bound. In other words, a timestamp is settled for a stream once +the state of the input at that timestamp is irrevocably known: either there is a +packet, or there is the certainty that a packet with that timestamp will not +arrive. + +Note: For this reason, MediaPipe also allows a stream producer to explicitly +advance the timestamp bound farther than what the last packet implies, i.e. to +provide a tighter bound. This can allow the downstream nodes to settle their +inputs sooner. + +A timestamp is settled across multiple streams if it is settled on each of those +streams. Furthermore, if a timestamp is settled it implies that all previous +timestamps are also settled. Thus settled timestamps can be processed +deterministically in ascending order. 
+ +Given this definition, a calculator with the default input policy is ready if +there is a timestamp which is settled across all input streams and contains a +packet on at least one input stream. The input policy provides all available +packets for a settled timestamp as a single *input set* to the calculator. + +One consequence of this deterministic behavior is that, for nodes with multiple +input streams, there can be a theoretically unbounded wait for a timestamp to be +settled, and an unbounded number of packets can be buffered in the meantime. +(Consider a node with two input streams, one of which keeps sending packets +while the other sends nothing and does not advance the bound.) + +Therefore, we also provide for custom input policies: for example, splitting the +inputs in different synchronization sets defined by +[`SyncSetInputStreamHandler`], or avoiding synchronization altogether and +processing inputs immediately as they arrive defined by +[`ImmediateInputStreamHandler`]. + +## Flow control + +There are two main flow control mechanisms. A backpressure mechanism throttles +the execution of upstream nodes when the packets buffered on a stream reach a +(configurable) limit defined by [`CalculatorGraphConfig::max_queue_size`]. This +mechanism maintains deterministic behavior, and includes a deadlock avoidance +system that relaxes configured limits when needed. + +The second system consists of inserting special nodes which can drop packets +according to real-time constraints (typically using custom input policies) +defined by [`FlowLimiterCalculator`]. For example, a common pattern places a +flow-control node at the input of a subgraph, with a loopback connection from +the final output to the flow-control node. 
The flow-control node is thus able to +keep track of how many timestamps are being processed in the downstream graph, +and drop packets if this count hits a (configurable) limit; and since packets +are dropped upstream, we avoid the wasted work that would result from partially +processing a timestamp and then dropping packets between intermediate stages. + +This calculator-based approach gives the graph author control of where packets +can be dropped, and allows flexibility in adapting and customizing the graph’s +behavior depending on resource constraints. + +[`CalculatorBase`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_base.h +[`DefaultInputStreamHandler`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/stream_handler/default_input_stream_handler.h +[`SyncSetInputStreamHandler`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/stream_handler/sync_set_input_stream_handler.cc +[`ImmediateInputStreamHandler`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/stream_handler/immediate_input_stream_handler.cc +[`CalculatorGraphConfig::max_queue_size`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator.proto +[`FlowLimiterCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/core/flow_limiter_calculator.cc diff --git a/docs/getting_started/android.md b/docs/getting_started/android.md new file mode 100644 index 0000000..83fbd1c --- /dev/null +++ b/docs/getting_started/android.md @@ -0,0 +1,94 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/android +title: MediaPipe on Android +parent: Getting Started +has_children: true +has_toc: false +nav_order: 1 +--- + +# MediaPipe on Android +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! 
We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +Please follow instructions below to build Android example apps in the supported +MediaPipe [solutions](../solutions/solutions.md). To learn more about these +example apps, start from [Hello World! on Android](./hello_world_android.md). + +To incorporate MediaPipe into Android Studio projects, see these +[instructions](./android_solutions.md) to use the MediaPipe Android Solution +APIs (currently in alpha) that are now available in +[Google's Maven Repository](https://maven.google.com/web/index.html?#com.google.mediapipe). + +## Building Android example apps with Bazel + +### Prerequisite + +* Install MediaPipe following these [instructions](./install.md). +* Setup Java Runtime. +* Setup Android SDK release 30.0.0 and above. +* Setup Android NDK version between 18 and 21. + +MediaPipe recommends setting up Android SDK and NDK via Android Studio (and see +below for Android Studio setup). However, if you prefer using MediaPipe without +Android Studio, please run +[`setup_android_sdk_and_ndk.sh`](https://github.com/google/mediapipe/blob/master/setup_android_sdk_and_ndk.sh) +to download and setup Android SDK and NDK before building any Android example +apps. + +If Android SDK and NDK are already installed (e.g., by Android Studio), set +$ANDROID_HOME and $ANDROID_NDK_HOME to point to the installed SDK and NDK. 
+ +```bash +export ANDROID_HOME= +export ANDROID_NDK_HOME= +``` + +and add android_ndk_repository() and android_sdk_repository() rules into the +[`WORKSPACE`](https://github.com/google/mediapipe/blob/master/WORKSPACE) file as +the following: + +```bash +$ echo "android_sdk_repository(name = \"androidsdk\")" >> WORKSPACE +$ echo "android_ndk_repository(name = \"androidndk\", api_level=21)" >> WORKSPACE +``` + +In order to use MediaPipe on earlier Android versions, MediaPipe needs to switch +to a lower Android API level. You can achieve this by specifying `api_level = +$YOUR_INTENDED_API_LEVEL` in android_ndk_repository() and/or +android_sdk_repository() in the +[`WORKSPACE`](https://github.com/google/mediapipe/blob/master/WORKSPACE) file. + +Tip: You can run this +[script](https://github.com/google/mediapipe/blob/master/build_android_examples.sh) +to build (and install) all MediaPipe Android example apps. + +1. To build an Android example app, build against the corresponding + `android_binary` build target. For instance, for + [MediaPipe Hands](../solutions/hands.md) the target is `handtrackinggpu` in + the + [BUILD](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/BUILD) + file: + + Note: To reduce the binary size, consider appending `--linkopt="-s"` to the + command below to strip symbols. + + ```bash + bazel build -c opt --config=android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu:handtrackinggpu + ``` + +2. 
Install it on a device with: + + ```bash + adb install bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/handtrackinggpu.apk + ``` diff --git a/docs/getting_started/android_archive_library.md b/docs/getting_started/android_archive_library.md new file mode 100644 index 0000000..7d98b32 --- /dev/null +++ b/docs/getting_started/android_archive_library.md @@ -0,0 +1,165 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/android_archive_library +title: MediaPipe Android Archive +parent: MediaPipe on Android +grand_parent: Getting Started +nav_order: 3 +--- + +# MediaPipe Android Archive +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +***Experimental Only*** + +The MediaPipe Android Archive (AAR) library is a convenient way to use MediaPipe +with Android Studio and Gradle. MediaPipe doesn't publish a general AAR that can +be used by all projects. Instead, developers need to add a mediapipe_aar() +target to generate a custom AAR file for their own projects. This is necessary +in order to include specific resources such as MediaPipe calculators needed for +each project. + +## Steps to build a MediaPipe AAR + +1. Create a mediapipe_aar() target. + + In the MediaPipe directory, create a new mediapipe_aar() target in a BUILD + file. You need to figure out what calculators are used in the graph and + provide the calculator dependencies to the mediapipe_aar(). For example, to + build an AAR for [MediaPipe Face Detection](../solutions/face_detection.md), + you can put the following code into + mediapipe/examples/android/src/java/com/google/mediapipe/apps/aar_example/BUILD. 
+ + ``` + load("//mediapipe/java/com/google/mediapipe:mediapipe_aar.bzl", "mediapipe_aar") + + mediapipe_aar( + name = "mediapipe_face_detection", + calculators = ["//mediapipe/graphs/face_detection:mobile_calculators"], + ) + ``` + +2. Run the Bazel build command to generate the AAR. + + ```bash + bazel build -c opt --strip=ALWAYS \ + --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ + --fat_apk_cpu=arm64-v8a,armeabi-v7a \ + --legacy_whole_archive=0 \ + --features=-legacy_whole_archive \ + --copt=-fvisibility=hidden \ + --copt=-ffunction-sections \ + --copt=-fdata-sections \ + --copt=-fstack-protector \ + --copt=-Oz \ + --copt=-fomit-frame-pointer \ + --copt=-DABSL_MIN_LOG_LEVEL=2 \ + --linkopt=-Wl,--gc-sections,--strip-all \ + //path/to/the/aar/build/file:aar_name.aar + ``` + + For the face detection AAR target we made in step 1, run: + + ```bash + bazel build -c opt --strip=ALWAYS \ + --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ + --fat_apk_cpu=arm64-v8a,armeabi-v7a \ + --legacy_whole_archive=0 \ + --features=-legacy_whole_archive \ + --copt=-fvisibility=hidden \ + --copt=-ffunction-sections \ + --copt=-fdata-sections \ + --copt=-fstack-protector \ + --copt=-Oz \ + --copt=-fomit-frame-pointer \ + --copt=-DABSL_MIN_LOG_LEVEL=2 \ + --linkopt=-Wl,--gc-sections,--strip-all \ + //mediapipe/examples/android/src/java/com/google/mediapipe/apps/aar_example:mediapipe_face_detection.aar + + # It should print: + # Target //mediapipe/examples/android/src/java/com/google/mediapipe/apps/aar_example:mediapipe_face_detection.aar up-to-date: + # bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/aar_example/mediapipe_face_detection.aar + ``` + +3. (Optional) Save the AAR to your preferred location. 
+ + ```bash + cp bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/aar_example/mediapipe_face_detection.aar \ + /absolute/path/to/your/preferred/location + ``` + +## Steps to use a MediaPipe AAR in Android Studio with Gradle + +1. Start Android Studio and go to your project. + +2. Copy the AAR into app/libs. + + ```bash + cp bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/aar_example/mediapipe_face_detection.aar \ + /path/to/your/app/libs/ + ``` + + ![Screenshot](https://mediapipe.dev/images/mobile/aar_location.png) + +3. Make app/src/main/assets and copy assets (graph, model, etc.) into + app/src/main/assets. + + Build the MediaPipe binary graph and copy the assets into + app/src/main/assets, e.g., for the face detection graph, you need to build + and copy + [the binary graph](https://github.com/google/mediapipe/blob/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectiongpu/BUILD#L41) + and + [the face detection tflite model](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_detection/face_detection_short_range.tflite). + + ```bash + bazel build -c opt mediapipe/graphs/face_detection:face_detection_mobile_gpu_binary_graph + cp bazel-bin/mediapipe/graphs/face_detection/face_detection_mobile_gpu.binarypb /path/to/your/app/src/main/assets/ + cp mediapipe/modules/face_detection/face_detection_short_range.tflite /path/to/your/app/src/main/assets/ + ``` + + ![Screenshot](https://mediapipe.dev/images/mobile/assets_location.png) + +4. Modify app/build.gradle to add MediaPipe dependencies and MediaPipe AAR.
+ + ``` + dependencies { + implementation fileTree(dir: 'libs', include: ['*.jar', '*.aar']) + implementation 'androidx.appcompat:appcompat:1.0.2' + implementation 'androidx.constraintlayout:constraintlayout:1.1.3' + testImplementation 'junit:junit:4.12' + androidTestImplementation 'androidx.test.ext:junit:1.1.0' + androidTestImplementation 'androidx.test.espresso:espresso-core:3.1.1' + // MediaPipe deps + implementation 'com.google.flogger:flogger:latest.release' + implementation 'com.google.flogger:flogger-system-backend:latest.release' + implementation 'com.google.code.findbugs:jsr305:latest.release' + implementation 'com.google.guava:guava:27.0.1-android' + implementation 'com.google.protobuf:protobuf-javalite:3.19.1' + // CameraX core library + def camerax_version = "1.0.0-beta10" + implementation "androidx.camera:camera-core:$camerax_version" + implementation "androidx.camera:camera-camera2:$camerax_version" + implementation "androidx.camera:camera-lifecycle:$camerax_version" + // AutoValue + def auto_value_version = "1.8.1" + implementation "com.google.auto.value:auto-value-annotations:$auto_value_version" + annotationProcessor "com.google.auto.value:auto-value:$auto_value_version" + } + ``` + +5. Follow our Android app examples to use MediaPipe in Android Studio for your + use case. If you are looking for an example, a face detection example can be + found + [here](https://github.com/jiuqiant/mediapipe_face_detection_aar_example) and + a multi-hand tracking example can be found + [here](https://github.com/jiuqiant/mediapipe_multi_hands_tracking_aar_example). 
diff --git a/docs/getting_started/android_solutions.md b/docs/getting_started/android_solutions.md new file mode 100644 index 0000000..159d135 --- /dev/null +++ b/docs/getting_started/android_solutions.md @@ -0,0 +1,138 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/ +title: MediaPipe Android Solutions +parent: MediaPipe on Android +grand_parent: Getting Started +nav_order: 2 +--- + +# MediaPipe Android Solutions +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +MediaPipe Android Solution APIs (currently in alpha) are available in: + +* [MediaPipe Face Detection](../solutions/face_detection#android-solution-api) +* [MediaPipe Face Mesh](../solutions/face_mesh#android-solution-api) +* [MediaPipe Hands](../solutions/hands#android-solution-api) + +## Incorporation in Android Studio + +Prebuilt packages of Android Solution APIs can be found in +[Google's Maven Repository](https://maven.google.com/web/index.html?#com.google.mediapipe). +To incorporate them into an Android Studio project, add the following into the +project's Gradle dependencies: + +``` +dependencies { + // MediaPipe solution-core is the foundation of any MediaPipe Solutions. + implementation 'com.google.mediapipe:solution-core:latest.release' + // Optional: MediaPipe Face Detection Solution. + implementation 'com.google.mediapipe:facedetection:latest.release' + // Optional: MediaPipe Face Mesh Solution. + implementation 'com.google.mediapipe:facemesh:latest.release' + // Optional: MediaPipe Hands Solution. 
+ implementation 'com.google.mediapipe:hands:latest.release' +} +``` + +If you need further customization, instead of using the prebuilt maven packages +consider building a MediaPipe Android Archive library locally from source by +following these [instructions](./android_archive_library.md). + +## Building solution example apps + +Detailed usage examples of the Android Solution APIs can be found in the +[source code](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/solutions) +of the solution example apps. + +To build these apps: + +1. Open Android Studio Arctic Fox on Linux, macOS, or Windows. + +2. Import mediapipe/examples/android/solutions directory into Android Studio. + + ![Screenshot](https://mediapipe.dev/images/import_mp_android_studio_project.png) + +3. For Windows users, run `create_win_symlinks.bat` as administrator to create + res directory symlinks. + + ![Screenshot](https://mediapipe.dev/images/run_create_win_symlinks.png) + +4. Select "File" -> "Sync Project with Gradle Files" to sync project. + +5. Run solution example app in Android Studio. + + ![Screenshot](https://mediapipe.dev/images/run_android_solution_app.png) + +6. (Optional) Run solutions on CPU. + + MediaPipe solution example apps run the pipeline and model inference on GPU + by default. If needed, for example to run the apps on Android Emulator, set + the `RUN_ON_GPU` boolean variable to `false` in the app's + `MainActivity.java` to run the pipeline and model inference on CPU. + +## MediaPipe Solution APIs Terms of Service + +Last modified: November 12, 2021 + +Use of MediaPipe Solution APIs is subject to the +[Google APIs Terms of Service](https://developers.google.com/terms), +[Google API Services User Data Policy](https://developers.google.com/terms/api-services-user-data-policy), +and the terms below. Please check back from time to time as these terms and +policies are occasionally updated. 
+ +**Privacy** + +When you use MediaPipe Solution APIs, processing of the input data (e.g. images, +video, text) fully happens on-device, and **MediaPipe does not send that input +data to Google servers**. As a result, you can use our APIs for processing data +that should not leave the device. + +MediaPipe Android Solution APIs will contact Google servers from time to time in +order to receive things like bug fixes, updated models, and hardware accelerator +compatibility information. MediaPipe Android Solution APIs also send metrics +about the performance and utilization of the APIs in your app to Google. Google +uses this metrics data to measure performance, API usage, debug, maintain and +improve the APIs, and detect misuse or abuse, as further described in our +[Privacy Policy](https://policies.google.com/privacy). + +**You are responsible for obtaining informed consent from your app users about +Google’s processing of MediaPipe metrics data as required by applicable law.** + +Data we collect may include the following, across all MediaPipe Android Solution +APIs: + +- Device information (such as manufacturer, model, OS version and build) and + available ML hardware accelerators (GPU and DSP). Used for diagnostics and + usage analytics. + +- App identification information (package name / bundle id, app version). Used + for diagnostics and usage analytics. + +- API configuration (such as image format, resolution, and MediaPipe version + used). Used for diagnostics and usage analytics. + +- Event type (such as initialize, download model, update, run, and detection). + Used for diagnostics and usage analytics. + +- Error codes. Used for diagnostics. + +- Performance metrics. Used for diagnostics. + +- Per-installation identifiers that do not uniquely identify a user or + physical device. Used for operation of remote configuration and usage + analytics. + +- Network request sender IP addresses. Used for remote configuration + diagnostics. 
Collected IP addresses are retained temporarily. diff --git a/docs/getting_started/building_examples.md b/docs/getting_started/building_examples.md new file mode 100644 index 0000000..a77f6ea --- /dev/null +++ b/docs/getting_started/building_examples.md @@ -0,0 +1,40 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/ +title: Building MediaPipe Examples +parent: Getting Started +nav_exclude: true +--- + +# Building MediaPipe Examples +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +### Android + +Please see these [instructions](./android.md). + +### iOS + +Please see these [instructions](./ios.md). + +### Python + +Please see these [instructions](./python.md). + +### JavaScript + +Please see these [instructions](./javascript.md). + +### C++ + +Please see these [instructions](./cpp.md). diff --git a/docs/getting_started/cpp.md b/docs/getting_started/cpp.md new file mode 100644 index 0000000..d708866 --- /dev/null +++ b/docs/getting_started/cpp.md @@ -0,0 +1,69 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/cpp +title: MediaPipe in C++ +parent: Getting Started +has_children: true +has_toc: false +nav_order: 5 +--- + +# MediaPipe in C++ +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +Please follow instructions below to build C++ command-line example apps in the +supported MediaPipe [solutions](../solutions/solutions.md). To learn more about +these example apps, start from [Hello World! in C++](./hello_world_cpp.md). 
+ +## Building C++ command-line example apps + +### Option 1: Running on CPU + +1. To build, for example, [MediaPipe Hands](../solutions/hands.md), run: + + ```bash + bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 mediapipe/examples/desktop/hand_tracking:hand_tracking_cpu + ``` + +2. To run the application: + + ```bash + GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/hand_tracking/hand_tracking_cpu \ + --calculator_graph_config_file=mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt + ``` + + This will open up your webcam as long as it is connected and on. Any errors + are likely due to your webcam not being accessible. + +### Option 2: Running on GPU + +Note: This currently works only on Linux. Please first follow +[OpenGL ES Setup on Linux Desktop](./gpu_support.md#opengl-es-setup-on-linux-desktop). + +1. To build, for example, [MediaPipe Hands](../solutions/hands.md), run: + + ```bash + bazel build -c opt --copt -DMESA_EGL_NO_X11_HEADERS --copt -DEGL_NO_X11 \ + mediapipe/examples/desktop/hand_tracking:hand_tracking_gpu + ``` + +2. To run the application: + + ```bash + GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/hand_tracking/hand_tracking_gpu \ + --calculator_graph_config_file=mediapipe/graphs/hand_tracking/hand_tracking_desktop_live_gpu.pbtxt + ``` + + This will open up your webcam as long as it is connected and on. Any errors + are likely due to your webcam not being accessible, or GPU drivers not being + set up properly. diff --git a/docs/getting_started/faq.md b/docs/getting_started/faq.md new file mode 100644 index 0000000..db84fe2 --- /dev/null +++ b/docs/getting_started/faq.md @@ -0,0 +1,153 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/faq +title: FAQ +parent: Getting Started +nav_order: 9 +--- + +# FAQ +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe!
We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +### How to convert ImageFrames and GpuBuffers + +The Calculators [`ImageFrameToGpuBufferCalculator`] and +[`GpuBufferToImageFrameCalculator`] convert back and forth between packets of +type [`ImageFrame`] and [`GpuBuffer`]. [`ImageFrame`] refers to image data in +CPU memory in any of a number of bitmap image formats. [`GpuBuffer`] refers to +image data in GPU memory. You can find more detail in the Framework Concepts +section +[GpuBuffer to ImageFrame Converters](./gpu.md#gpubuffer-to-imageframe-converters). +You can see an example in: + +* [`object_detection_mobile_cpu.pbtxt`] + +### How to visualize perception results + +The [`AnnotationOverlayCalculator`] allows perception results, such as bounding +boxes, arrows, and ovals, to be superimposed on the video frames aligned with +the recognized objects. The results can be displayed in a diagnostic window when +running on a workstation, or in a texture frame when running on device. You can +see an example use of [`AnnotationOverlayCalculator`] in: + +* [`face_detection_mobile_gpu.pbtxt`]. + +### How to run calculators in parallel + +Within a calculator graph, MediaPipe routinely runs separate calculator nodes +in parallel. MediaPipe maintains a pool of threads, and runs each calculator +as soon as a thread is available and all of its inputs are ready. Each +calculator instance is only run for one set of inputs at a time, so most +calculators need only to be *thread-compatible* and not *thread-safe*. + +In order to enable one calculator to process multiple inputs in parallel, there +are two possible approaches: + +1. Define multiple calculator nodes and dispatch input packets to all nodes. +2. Make the calculator thread-safe and configure its [`max_in_flight`] setting.
+ +The first approach can be followed using the calculators designed to distribute +packets across other calculators, such as [`RoundRobinDemuxCalculator`]. A +single [`RoundRobinDemuxCalculator`] can distribute successive packets across +several identically configured [`ScaleImageCalculator`] nodes. + +The second approach allows up to [`max_in_flight`] invocations of the +[`CalculatorBase::Process`] method on the same calculator node. The output +packets from [`CalculatorBase::Process`] are automatically ordered by timestamp +before they are passed along to downstream calculators. + +With either approach, you must be aware that the calculator running in parallel +cannot maintain internal state in the same way as a normal sequential +calculator. + +### Output timestamps when using ImmediateInputStreamHandler + +The [`ImmediateInputStreamHandler`] delivers each packet as soon as it arrives +at an input stream. As a result, it can deliver a packet +with a higher timestamp from one input stream before delivering a packet with a +lower timestamp from a different input stream. If these input timestamps are +both used for packets sent to one output stream, that output stream will +complain that the timestamps are not monotonically increasing. In order to +remedy this, the calculator must take care to output a packet only after +processing is complete for its timestamp. This could be accomplished by waiting +until input packets have been received from all input streams for that timestamp, +or by ignoring a packet that arrives with a timestamp that has already been +processed. + +### How to change settings at runtime + +There are two main approaches to changing the settings of a calculator graph +while the application is running: + +1. Restart the calculator graph with a modified [`CalculatorGraphConfig`]. +2. Send new calculator options through packets on graph input-streams.
+ +The first approach has the advantage of leveraging [`CalculatorGraphConfig`] +processing tools such as "subgraphs". The second approach has the advantage of +allowing active calculators and packets to remain in-flight while settings +change. MediaPipe contributors are currently investigating alternative approaches +to achieve both of these advantages. + +### How to process realtime input streams + +The MediaPipe framework can be used to process data streams either online or +offline. For offline processing, packets are pushed into the graph as soon as +calculators are ready to process those packets. For online processing, one +packet for each frame is pushed into the graph as that frame is recorded. + +The MediaPipe framework requires only that successive packets be assigned +monotonically increasing timestamps. By convention, realtime calculators and +graphs use the recording time or the presentation time as the timestamp for each +packet, with each timestamp representing microseconds since +`Jan/1/1970:00:00:00`. This allows packets from various sources to be processed +in a globally consistent order. + +Normally for offline processing, every input packet is processed and processing +continues as long as necessary. For online processing, it is often necessary to +drop input packets in order to keep pace with the arrival of input data frames. +When inputs arrive too frequently, the recommended technique for dropping +packets is to use the MediaPipe calculators designed specifically for this +purpose such as [`FlowLimiterCalculator`] and [`PacketClonerCalculator`]. + +For online processing, it is also necessary to promptly determine when processing +can proceed. MediaPipe supports this by propagating timestamp bounds between +calculators. Timestamp bounds indicate timestamp intervals that will contain no +input packets, and they allow calculators to begin processing for those +timestamps immediately. 
Calculators designed for realtime processing should +carefully calculate timestamp bounds in order to begin processing as promptly as +possible. For example, the [`MakePairCalculator`] uses the `SetOffset` API to +propagate timestamp bounds from input streams to output streams. + +### Can I run MediaPipe on MS Windows? + +Currently MediaPipe portability supports Debian Linux, Ubuntu Linux, +macOS, Android, and iOS. The core of MediaPipe framework is a C++ library +conforming to the C++11 standard, so it is relatively easy to port to +additional platforms. + +[`object_detection_mobile_cpu.pbtxt`]: https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection/object_detection_mobile_cpu.pbtxt +[`ImageFrame`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/formats/image_frame.h +[`GpuBuffer`]: https://github.com/google/mediapipe/tree/master/mediapipe/gpu/gpu_buffer.h +[`GpuBufferToImageFrameCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/gpu/gpu_buffer_to_image_frame_calculator.cc +[`ImageFrameToGpuBufferCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/gpu/image_frame_to_gpu_buffer_calculator.cc +[`AnnotationOverlayCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/util/annotation_overlay_calculator.cc +[`face_detection_mobile_gpu.pbtxt`]: https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt +[`CalculatorBase::Process`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_base.h +[`max_in_flight`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator.proto +[`RoundRobinDemuxCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/core/round_robin_demux_calculator.cc +[`ScaleImageCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/image/scale_image_calculator.cc 
+[`ImmediateInputStreamHandler`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/stream_handler/immediate_input_stream_handler.cc +[`CalculatorGraphConfig`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator.proto +[`FlowLimiterCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/core/flow_limiter_calculator.cc +[`PacketClonerCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/core/packet_cloner_calculator.cc +[`MakePairCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/core/make_pair_calculator.cc diff --git a/docs/getting_started/getting_started.md b/docs/getting_started/getting_started.md new file mode 100644 index 0000000..db605b4 --- /dev/null +++ b/docs/getting_started/getting_started.md @@ -0,0 +1,20 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/ +title: Getting Started +nav_order: 2 +has_children: true +--- + +# Getting Started +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- diff --git a/docs/getting_started/gpu_support.md b/docs/getting_started/gpu_support.md new file mode 100644 index 0000000..6c0e8be --- /dev/null +++ b/docs/getting_started/gpu_support.md @@ -0,0 +1,208 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/gpu_support +title: GPU Support +parent: Getting Started +nav_order: 7 +--- + +# GPU Support +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! 
We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## OpenGL ES Support + +MediaPipe supports OpenGL ES up to version 3.2 on Android/Linux and up to ES 3.0 +on iOS. In addition, MediaPipe also supports Metal on iOS. + +OpenGL ES 3.1 or greater is required (on Android/Linux systems) for running +machine learning inference calculators and graphs. + +## Disable OpenGL ES Support + +By default, building MediaPipe (with no special bazel flags) attempts to compile +and link against OpenGL ES (and for iOS also Metal) libraries. + +On platforms where OpenGL ES is not available (see also +[OpenGL ES Setup on Linux Desktop](#opengl-es-setup-on-linux-desktop)), you +should disable OpenGL ES support with: + +``` +$ bazel build --define MEDIAPIPE_DISABLE_GPU=1 +``` + +Note: On Android and iOS, OpenGL ES is required by MediaPipe framework and the +support should never be disabled. + +## OpenGL ES Setup on Linux Desktop + +On Linux desktop with video cards that support OpenGL ES 3.1+, MediaPipe can run +GPU compute and rendering and perform TFLite inference on GPU. + +To check if your Linux desktop GPU can run MediaPipe with OpenGL ES: + +```bash +$ sudo apt-get install mesa-common-dev libegl1-mesa-dev libgles2-mesa-dev +$ sudo apt-get install mesa-utils +$ glxinfo | grep -i opengl +``` + +For example, it may print: + +```bash +$ glxinfo | grep -i opengl +... +OpenGL ES profile version string: OpenGL ES 3.2 NVIDIA 430.50 +OpenGL ES profile shading language version string: OpenGL ES GLSL ES 3.20 +OpenGL ES profile extensions: +``` + +If you have connected to your computer through SSH and find when you probe for +GPU information you see the output: + +```bash +glxinfo | grep -i opengl +Error: unable to open display +``` + +Try re-establishing your SSH connection with the `-X` option and try again. 
For +example: + +```bash +ssh -X <user>@<host> +``` + +*Notice the ES 3.20 text above.* + +You need to see ES 3.1 or greater printed in order to perform TFLite inference +on GPU in MediaPipe. With this setup, build with: + +``` +$ bazel build --copt -DMESA_EGL_NO_X11_HEADERS --copt -DEGL_NO_X11 +``` + +If only ES 3.0 or below is supported, you can still build MediaPipe targets that +don't require TFLite inference on GPU with: + +``` +$ bazel build --copt -DMESA_EGL_NO_X11_HEADERS --copt -DEGL_NO_X11 --copt -DMEDIAPIPE_DISABLE_GL_COMPUTE +``` + +Note: MEDIAPIPE_DISABLE_GL_COMPUTE is already defined automatically on all Apple +systems (Apple doesn't support OpenGL ES 3.1+). + +## TensorFlow CUDA Support and Setup on Linux Desktop + +MediaPipe framework doesn't require CUDA for GPU compute and rendering. However, +MediaPipe can work with TensorFlow to perform GPU inference on video cards that +support CUDA. + +To enable TensorFlow GPU inference with MediaPipe, the first step is to follow +the +[TensorFlow GPU documentation](https://www.tensorflow.org/install/gpu#software_requirements) +to install the required NVIDIA software on your Linux desktop. 
+ +After installation, update `$PATH` and `$LD_LIBRARY_PATH` and run `ldconfig` +with: + +``` +$ export PATH=/usr/local/cuda-10.1/bin${PATH:+:${PATH}} +$ export LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-10.1/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} +$ sudo ldconfig +``` + +It's recommended to verify the installation of CUPTI, CUDA, CuDNN, and NVCC: + +``` +$ ls /usr/local/cuda/extras/CUPTI +/lib64 +libcupti.so libcupti.so.10.1.208 libnvperf_host.so libnvperf_target.so +libcupti.so.10.1 libcupti_static.a libnvperf_host_static.a + +$ ls /usr/local/cuda-10.1 +LICENSE bin extras lib64 libnvvp nvml samples src tools +README doc include libnsight nsightee_plugins nvvm share targets version.txt + +$ nvcc -V +nvcc: NVIDIA (R) Cuda compiler driver +Copyright (c) 2005-2019 NVIDIA Corporation +Built on Sun_Jul_28_19:07:16_PDT_2019 +Cuda compilation tools, release 10.1, V10.1.243 + +$ ls /usr/lib/x86_64-linux-gnu/ | grep libcudnn.so +libcudnn.so +libcudnn.so.7 +libcudnn.so.7.6.4 +``` + +Setting `$TF_CUDA_PATHS` is the way to declare where the CUDA library is. Note +that the following code snippet also adds `/usr/lib/x86_64-linux-gnu` and +`/usr/include` into `$TF_CUDA_PATHS` for cudablas and libcudnn. + +``` +$ export TF_CUDA_PATHS=/usr/local/cuda-10.1,/usr/lib/x86_64-linux-gnu,/usr/include +``` + +To make MediaPipe get TensorFlow's CUDA settings, find TensorFlow's +[.bazelrc](https://github.com/tensorflow/tensorflow/blob/master/.bazelrc) and +copy the `build:using_cuda` and `build:cuda` section into MediaPipe's .bazelrc +file. For example, as of April 23, 2020, TensorFlow's CUDA setting is the +following: + +``` +# This config refers to building with CUDA available. It does not necessarily +# mean that we build CUDA op kernels. 
+build:using_cuda --define=using_cuda=true +build:using_cuda --action_env TF_NEED_CUDA=1 +build:using_cuda --crosstool_top=@local_config_cuda//crosstool:toolchain + +# This config refers to building CUDA op kernels with nvcc. +build:cuda --config=using_cuda +build:cuda --define=using_cuda_nvcc=true +``` + +Finally, build MediaPipe with TensorFlow GPU with two more flags `--config=cuda` +and `--spawn_strategy=local`. For example: + +``` +$ bazel build -c opt --config=cuda --spawn_strategy=local \ + --define no_aws_support=true --copt -DMESA_EGL_NO_X11_HEADERS \ + mediapipe/examples/desktop/object_detection:object_detection_tensorflow +``` + +While the binary is running, it prints out the GPU device info: + +``` +I external/org_tensorflow/tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1 +I external/org_tensorflow/tensorflow/core/common_runtime/gpu/gpu_device.cc:1544] Found device 0 with properties: pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5 coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s +I external/org_tensorflow/tensorflow/core/common_runtime/gpu/gpu_device.cc:1686] Adding visible gpu devices: 0 +``` + +You can monitor the GPU usage to verify whether the GPU is used for model +inference. + +``` +$ nvidia-smi --query-gpu=utilization.gpu --format=csv --loop=1 + +0 % +0 % +4 % +5 % +83 % +21 % +22 % +27 % +29 % +100 % +0 % +0% +``` diff --git a/docs/getting_started/hello_world_android.md b/docs/getting_started/hello_world_android.md new file mode 100644 index 0000000..1148ff5 --- /dev/null +++ b/docs/getting_started/hello_world_android.md @@ -0,0 +1,785 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/hello_world_android +title: Hello World! on Android +parent: MediaPipe on Android +grand_parent: Getting Started +nav_order: 1 +--- + +# Hello World! on Android +{: .no_toc } + +1. 
TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Introduction + +This codelab uses MediaPipe on an Android device. + +### What you will learn + +How to develop an Android application that uses MediaPipe and run a MediaPipe +graph on Android. + +### What you will build + +A simple camera app for real-time Sobel edge detection applied to a live video +stream on an Android device. + +![edge_detection_android_gpu_gif](https://mediapipe.dev/images/mobile/edge_detection_android_gpu.gif) + +## Setup + +1. Install MediaPipe on your system, see + [MediaPipe installation guide](./install.md) for details. +2. Install Android Development SDK and Android NDK. See how to do so also in + [MediaPipe installation guide]. +3. Enable [developer options] on your Android device. +4. Setup [Bazel] on your system to build and deploy the Android app. + +## Graph for edge detection + +We will be using the following graph, [`edge_detection_mobile_gpu.pbtxt`]: + +``` +# MediaPipe graph that performs GPU Sobel edge detection on a live video stream. +# Used in the examples in +# mediapipe/examples/android/src/java/com/mediapipe/apps/basic and +# mediapipe/examples/ios/edgedetectiongpu. + +# Images coming into and out of the graph. +input_stream: "input_video" +output_stream: "output_video" + +# Converts RGB images into luminance images, still stored in RGB format. +node: { + calculator: "LuminanceCalculator" + input_stream: "input_video" + output_stream: "luma_video" +} + +# Applies the Sobel filter to luminance images stored in RGB format. 
+node: { + calculator: "SobelEdgesCalculator" + input_stream: "luma_video" + output_stream: "output_video" +} +``` + +A visualization of the graph is shown below: + +![edge_detection_mobile_gpu](https://mediapipe.dev/images/mobile/edge_detection_mobile_gpu.png) + +This graph has a single input stream named `input_video` for all incoming frames +that will be provided by your device's camera. + +The first node in the graph, `LuminanceCalculator`, takes a single packet (image +frame) and applies a change in luminance using an OpenGL shader. The resulting +image frame is sent to the `luma_video` output stream. + +The second node, `SobelEdgesCalculator` applies edge detection to incoming +packets in the `luma_video` stream and outputs results in `output_video` output +stream. + +Our Android application will display the output image frames of the +`output_video` stream. + +## Initial minimal application setup + +We first start with a simple Android application that displays "Hello World!" +on the screen. You may skip this step if you are familiar with building Android +applications using `bazel`. + +Create a new directory where you will create your Android application. For +example, the complete code of this tutorial can be found at +`mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic`. We +will refer to this path as `$APPLICATION_PATH` throughout the codelab. + +Note that in the path to the application: + +* The application is named `helloworld`. +* The `$PACKAGE_PATH` of the application is + `com.google.mediapipe.apps.basic`. This is used in code snippets in this + tutorial, so please remember to use your own `$PACKAGE_PATH` when you + copy/use the code snippets. + +Add a file `activity_main.xml` to `$APPLICATION_PATH/res/layout`. 
This displays +a [`TextView`] on the full screen of the application with the string `Hello +World!`: + +``` + + + + + + +``` + +Add a simple `MainActivity.java` to `$APPLICATION_PATH` which loads the content +of the `activity_main.xml` layout as shown below: + +``` +package com.google.mediapipe.apps.basic; + +import android.os.Bundle; +import androidx.appcompat.app.AppCompatActivity; + +/** Bare-bones main activity. */ +public class MainActivity extends AppCompatActivity { + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_main); + } +} +``` + +Add a manifest file, `AndroidManifest.xml` to `$APPLICATION_PATH`, which +launches `MainActivity` on application start: + +``` + + + + + + + + + + + + + + + +``` + +In our application we are using a `Theme.AppCompat` theme in the app, so we need +appropriate theme references. Add `colors.xml` to +`$APPLICATION_PATH/res/values/`: + +``` + + + #008577 + #00574B + #D81B60 + +``` + +Add `styles.xml` to `$APPLICATION_PATH/res/values/`: + +``` + + + + + + +``` + +To build the application, add a `BUILD` file to `$APPLICATION_PATH`, and +`${appName}` and `${mainActivity}` in the manifest will be replaced by strings +specified in `BUILD` as shown below. + +``` +android_library( + name = "basic_lib", + srcs = glob(["*.java"]), + manifest = "AndroidManifest.xml", + resource_files = glob(["res/**"]), + deps = [ + "//third_party:android_constraint_layout", + "//third_party:androidx_appcompat", + ], +) + +android_binary( + name = "helloworld", + manifest = "AndroidManifest.xml", + manifest_values = { + "applicationId": "com.google.mediapipe.apps.basic", + "appName": "Hello World", + "mainActivity": ".MainActivity", + }, + multidex = "native", + deps = [ + ":basic_lib", + ], +) +``` + +The `android_library` rule adds dependencies for `MainActivity`, resource files +and `AndroidManifest.xml`. 
+ +The `android_binary` rule uses the `basic_lib` Android library generated to +build a binary APK for installation on your Android device. + +To build the app, use the following command: + +``` +bazel build -c opt --config=android_arm64 $APPLICATION_PATH:helloworld +``` + +Install the generated APK file using `adb install`. For example: + +``` +adb install bazel-bin/$APPLICATION_PATH/helloworld.apk +``` + +Open the application on your device. It should display a screen with the text +`Hello World!`. + +![bazel_hello_world_android](https://mediapipe.dev/images/mobile/bazel_hello_world_android.png) + +## Using the camera via `CameraX` + +### Camera Permissions + +To use the camera in our application, we need to request the user to provide +access to the camera. To request camera permissions, add the following to +`AndroidManifest.xml`: + +``` + + + +``` + +Change the minimum SDK version to `21` and target SDK version to `27` in the +same file: + +``` + +``` + +This ensures that the user is prompted to request camera permission and enables +us to use the [CameraX] library for camera access. + +To request camera permissions, we can use a utility provided by MediaPipe +components, namely [`PermissionHelper`]. To use it, add a dependency +`"//mediapipe/java/com/google/mediapipe/components:android_components"` in the +`mediapipe_lib` rule in `BUILD`. + +To use the `PermissionHelper` in `MainActivity`, add the following line to the +`onCreate` function: + +``` +PermissionHelper.checkAndRequestCameraPermissions(this); +``` + +This prompts the user with a dialog on the screen to request for permissions to +use the camera in this application. 
+ +Add the following code to handle the user response: + +``` +@Override +public void onRequestPermissionsResult( + int requestCode, String[] permissions, int[] grantResults) { + super.onRequestPermissionsResult(requestCode, permissions, grantResults); + PermissionHelper.onRequestPermissionsResult(requestCode, permissions, grantResults); +} + +@Override +protected void onResume() { + super.onResume(); + if (PermissionHelper.cameraPermissionsGranted(this)) { + startCamera(); + } +} + +public void startCamera() {} +``` + +We will leave the `startCamera()` method empty for now. When the user responds +to the prompt, the `MainActivity` will resume and `onResume()` will be called. +The code will confirm that permissions for using the camera have been granted, +and then will start the camera. + +Rebuild and install the application. You should now see a prompt requesting +access to the camera for the application. + +Note: If there is no dialog prompt, uninstall and reinstall the application. +This may also happen if you haven't changed the `minSdkVersion` and +`targetSdkVersion` in the `AndroidManifest.xml` file. + +### Camera Access + +With camera permissions available, we can start and fetch frames from the +camera. + +To view the frames from the camera we will use a [`SurfaceView`]. Each frame +from the camera will be stored in a [`SurfaceTexture`] object. To use these, we +first need to change the layout of our application. + +Remove the entire [`TextView`] code block from +`$APPLICATION_PATH/res/layout/activity_main.xml` and add the following code +instead: + +``` + + + +``` + +This code block has a new [`FrameLayout`] named `preview_display_layout` and a +[`TextView`] nested inside it, named `no_camera_access_preview`. When camera +access permissions are not granted, our application will display the +[`TextView`] with a string message, stored in the variable `no_camera_access`. 
+Add the following line in the `$APPLICATION_PATH/res/values/strings.xml` file: + +``` +Please grant camera permissions. +``` + +When the user doesn't grant camera permission, the screen will now look like +this: + +![missing_camera_permission_android](https://mediapipe.dev/images/mobile/missing_camera_permission_android.png) + +Now, we will add the [`SurfaceTexture`] and [`SurfaceView`] objects to +`MainActivity`: + +``` +private SurfaceTexture previewFrameTexture; +private SurfaceView previewDisplayView; +``` + +In the `onCreate(Bundle)` function, add the following two lines _before_ +requesting camera permissions: + +``` +previewDisplayView = new SurfaceView(this); +setupPreviewDisplayView(); +``` + +And now add the code defining `setupPreviewDisplayView()`: + +``` +private void setupPreviewDisplayView() { + previewDisplayView.setVisibility(View.GONE); + ViewGroup viewGroup = findViewById(R.id.preview_display_layout); + viewGroup.addView(previewDisplayView); +} +``` + +We define a new [`SurfaceView`] object and add it to the +`preview_display_layout` [`FrameLayout`] object so that we can use it to display +the camera frames using a [`SurfaceTexture`] object named `previewFrameTexture`. + +To use `previewFrameTexture` for getting camera frames, we will use [CameraX]. +MediaPipe provides a utility named [`CameraXPreviewHelper`] to use [CameraX]. +This class updates a listener when camera is started via +`onCameraStarted(@Nullable SurfaceTexture)`. + +To use this utility, modify the `BUILD` file to add a dependency on +`"//mediapipe/java/com/google/mediapipe/components:android_camerax_helper"`. 
+ +Now import [`CameraXPreviewHelper`] and add the following line to +`MainActivity`: + +``` +private CameraXPreviewHelper cameraHelper; +``` + +Now, we can add our implementation to `startCamera()`: + +``` +public void startCamera() { + cameraHelper = new CameraXPreviewHelper(); + cameraHelper.setOnCameraStartedListener( + surfaceTexture -> { + previewFrameTexture = surfaceTexture; + // Make the display view visible to start showing the preview. + previewDisplayView.setVisibility(View.VISIBLE); + }); +} +``` + +This creates a new [`CameraXPreviewHelper`] object and adds an anonymous +listener on the object. When `cameraHelper` signals that the camera has started +and a `surfaceTexture` to grab frames is available, we save that +`surfaceTexture` as `previewFrameTexture`, and make the `previewDisplayView` +visible so that we can start seeing frames from the `previewFrameTexture`. + +However, before starting the camera, we need to decide which camera we want to +use. [`CameraXPreviewHelper`] inherits from [`CameraHelper`] which provides two +options, `FRONT` and `BACK`. We can pass in the decision from the `BUILD` file +as metadata such that no code change is required to build another version of the +app using a different camera. + +Assuming we want to use `BACK` camera to perform edge detection on a live scene +that we view from the camera, add the metadata into `AndroidManifest.xml`: + +``` + ... 
+ + + +``` + +and specify the selection in `BUILD` in the `helloworld` android binary rule +with a new entry in `manifest_values`: + +``` +manifest_values = { + "applicationId": "com.google.mediapipe.apps.basic", + "appName": "Hello World", + "mainActivity": ".MainActivity", + "cameraFacingFront": "False", +}, +``` + +Now, in `MainActivity` to retrieve the metadata specified in `manifest_values`, +add an [`ApplicationInfo`] object: + +``` +private ApplicationInfo applicationInfo; +``` + +In the `onCreate()` function, add: + +``` +try { + applicationInfo = + getPackageManager().getApplicationInfo(getPackageName(), PackageManager.GET_META_DATA); +} catch (NameNotFoundException e) { + Log.e(TAG, "Cannot find application info: " + e); +} +``` + +Now add the following line at the end of the `startCamera()` function: + +``` +CameraHelper.CameraFacing cameraFacing = + applicationInfo.metaData.getBoolean("cameraFacingFront", false) + ? CameraHelper.CameraFacing.FRONT + : CameraHelper.CameraFacing.BACK; +cameraHelper.startCamera(this, cameraFacing, /*unusedSurfaceTexture=*/ null); +``` + +At this point, the application should build successfully. However, when you run +the application on your device, you will see a black screen (even though camera +permissions have been granted). This is because even though we save the +`surfaceTexture` variable provided by the [`CameraXPreviewHelper`], the +`previewDisplayView` doesn't use its output and display it on screen yet. + +Since we want to use the frames in a MediaPipe graph, we will not add code to +view the camera output directly in this tutorial. Instead, we skip ahead to how +we can send camera frames for processing to a MediaPipe graph and display the +output of the graph on the screen. + +## `ExternalTextureConverter` setup + +A [`SurfaceTexture`] captures image frames from a stream as an OpenGL ES +texture. To use a MediaPipe graph, frames captured from the camera should be +stored in a regular OpenGL texture object. 
MediaPipe provides a class, +[`ExternalTextureConverter`] to convert the image stored in a [`SurfaceTexture`] +object to a regular OpenGL texture object. + +To use [`ExternalTextureConverter`], we also need an `EGLContext`, which is +created and managed by an [`EglManager`] object. Add a dependency to the `BUILD` +file to use [`EglManager`], `"//mediapipe/java/com/google/mediapipe/glutil"`. + +In `MainActivity`, add the following declarations: + +``` +private EglManager eglManager; +private ExternalTextureConverter converter; +``` + +In the `onCreate(Bundle)` function, add a statement to initialize the +`eglManager` object before requesting camera permissions: + +``` +eglManager = new EglManager(null); +``` + +Recall that we defined the `onResume()` function in `MainActivity` to confirm +camera permissions have been granted and call `startCamera()`. Before this +check, add the following line in `onResume()` to initialize the `converter` +object: + +``` +converter = new ExternalTextureConverter(eglManager.getContext()); +``` + +This `converter` now uses the `GLContext` managed by `eglManager`. + +We also need to override the `onPause()` function in the `MainActivity` so that +if the application goes into a paused state, we close the `converter` properly: + +``` +@Override +protected void onPause() { + super.onPause(); + converter.close(); +} +``` + +To pipe the output of `previewFrameTexture` to the `converter`, add the +following block of code to `setupPreviewDisplayView()`: + +``` +previewDisplayView + .getHolder() + .addCallback( + new SurfaceHolder.Callback() { + @Override + public void surfaceCreated(SurfaceHolder holder) {} + + @Override + public void surfaceChanged(SurfaceHolder holder, int format, int width, int height) { + // (Re-)Compute the ideal size of the camera-preview display (the area that the + // camera-preview frames get rendered onto, potentially with scaling and rotation) + // based on the size of the SurfaceView that contains the display. 
+ Size viewSize = new Size(width, height); + Size displaySize = cameraHelper.computeDisplaySizeFromViewSize(viewSize); + + // Connect the converter to the camera-preview frames as its input (via + // previewFrameTexture), and configure the output width and height as the computed + // display size. + converter.setSurfaceTextureAndAttachToGLContext( + previewFrameTexture, displaySize.getWidth(), displaySize.getHeight()); + } + + @Override + public void surfaceDestroyed(SurfaceHolder holder) {} + }); +``` + +In this code block, we add a custom [`SurfaceHolder.Callback`] to +`previewDisplayView` and implement the `surfaceChanged(SurfaceHolder holder, int +format, int width, int height)` function to compute an appropriate display size +of the camera frames on the device screen and to tie the `previewFrameTexture` +object and send frames of the computed `displaySize` to the `converter`. + +We are now ready to use camera frames in a MediaPipe graph. + +## Using a MediaPipe graph in Android + +### Add relevant dependencies + +To use a MediaPipe graph, we need to add dependencies to the MediaPipe framework +on Android. We will first add a build rule to build a `cc_binary` using JNI code +of the MediaPipe framework and then build a `cc_library` rule to use this binary +in our application. Add the following code block to your `BUILD` file: + +``` +cc_binary( + name = "libmediapipe_jni.so", + linkshared = 1, + linkstatic = 1, + deps = [ + "//mediapipe/java/com/google/mediapipe/framework/jni:mediapipe_framework_jni", + ], +) + +cc_library( + name = "mediapipe_jni_lib", + srcs = [":libmediapipe_jni.so"], + alwayslink = 1, +) +``` + +Add the dependency `":mediapipe_jni_lib"` to the `mediapipe_lib` build rule in +the `BUILD` file. + +Next, we need to add dependencies specific to the MediaPipe graph we want to use +in the application. 
+ +First, add dependencies to all calculator code in the `libmediapipe_jni.so` +build rule: + +``` +"//mediapipe/graphs/edge_detection:mobile_calculators", +``` + +MediaPipe graphs are `.pbtxt` files, but to use them in the application, we need +to use the `mediapipe_binary_graph` build rule to generate a `.binarypb` file. + +In the `helloworld` android binary build rule, add the `mediapipe_binary_graph` +target specific to the graph as an asset: + +``` +assets = [ + "//mediapipe/graphs/edge_detection:mobile_gpu_binary_graph", +], +assets_dir = "", +``` + +In the `assets` build rule, you can also add other assets such as TensorFlowLite +models used in your graph. + +In addition, add additional `manifest_values` for properties specific to the +graph, to be later retrieved in `MainActivity`: + +``` +manifest_values = { + "applicationId": "com.google.mediapipe.apps.basic", + "appName": "Hello World", + "mainActivity": ".MainActivity", + "cameraFacingFront": "False", + "binaryGraphName": "mobile_gpu.binarypb", + "inputVideoStreamName": "input_video", + "outputVideoStreamName": "output_video", +}, +``` + +Note that `binaryGraphName` indicates the filename of the binary graph, +determined by the `output_name` field in the `mediapipe_binary_graph` target. +`inputVideoStreamName` and `outputVideoStreamName` are the input and output +video stream name specified in the graph respectively. + +Now, the `MainActivity` needs to load the MediaPipe framework. Also, the +framework uses OpenCV, so `MainActivity` should also load `OpenCV`. Use the +following code in `MainActivity` (inside the class, but not inside any function) +to load both dependencies: + +``` +static { + // Load all native libraries needed by the app. + System.loadLibrary("mediapipe_jni"); + System.loadLibrary("opencv_java3"); +} +``` + +### Use the graph in `MainActivity` + +First, we need to load the asset which contains the `.binarypb` compiled from +the `.pbtxt` file of the graph. 
To do this, we can use a MediaPipe utility, +[`AndroidAssetUtil`]. + +Initialize the asset manager in `onCreate(Bundle)` before initializing +`eglManager`: + +``` +// Initialize asset manager so that MediaPipe native libraries can access the app assets, e.g., +// binary graphs. +AndroidAssetUtil.initializeNativeAssetManager(this); +``` + +Now, we need to set up a [`FrameProcessor`] object that sends camera frames +prepared by the `converter` to the MediaPipe graph and runs the graph, prepares +the output and then updates the `previewDisplayView` to display the output. Add +the following code to declare the `FrameProcessor`: + +``` +private FrameProcessor processor; +``` + +and initialize it in `onCreate(Bundle)` after initializing `eglManager`: + +``` +processor = + new FrameProcessor( + this, + eglManager.getNativeContext(), + applicationInfo.metaData.getString("binaryGraphName"), + applicationInfo.metaData.getString("inputVideoStreamName"), + applicationInfo.metaData.getString("outputVideoStreamName")); +``` + +The `processor` needs to consume the converted frames from the `converter` for +processing. Add the following line to `onResume()` after initializing the +`converter`: + +``` +converter.setConsumer(processor); +``` + +The `processor` should send its output to `previewDisplayView`. To do this, add +the following function definitions to our custom [`SurfaceHolder.Callback`]: + +``` +@Override +public void surfaceCreated(SurfaceHolder holder) { + processor.getVideoSurfaceOutput().setSurface(holder.getSurface()); +} + +@Override +public void surfaceDestroyed(SurfaceHolder holder) { + processor.getVideoSurfaceOutput().setSurface(null); +} +``` + +When the `SurfaceHolder` is created, we add the `Surface` to the +`VideoSurfaceOutput` of the `processor`. When it is destroyed, we remove it from +the `VideoSurfaceOutput` of the `processor`. + +And that's it!
You should now be able to successfully build and run the +application on the device and see Sobel edge detection running on a live camera +feed! Congrats! + +![edge_detection_android_gpu_gif](https://mediapipe.dev/images/mobile/edge_detection_android_gpu.gif) + +If you ran into any issues, please see the full code of the tutorial +[here](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic). + +[`ApplicationInfo`]:https://developer.android.com/reference/android/content/pm/ApplicationInfo +[`AndroidAssetUtil`]:https://github.com/google/mediapipe/tree/master/mediapipe/java/com/google/mediapipe/framework/AndroidAssetUtil.java +[Bazel]:https://bazel.build/ +[`CameraHelper`]:https://github.com/google/mediapipe/tree/master/mediapipe/java/com/google/mediapipe/components/CameraHelper.java +[CameraX]:https://developer.android.com/training/camerax +[`CameraXPreviewHelper`]:https://github.com/google/mediapipe/tree/master/mediapipe/java/com/google/mediapipe/components/CameraXPreviewHelper.java +[developer options]:https://developer.android.com/studio/debug/dev-options +[`edge_detection_mobile_gpu.pbtxt`]:https://github.com/google/mediapipe/tree/master/mediapipe/graphs/edge_detection/edge_detection_mobile_gpu.pbtxt +[`EglManager`]:https://github.com/google/mediapipe/tree/master/mediapipe/java/com/google/mediapipe/glutil/EglManager.java +[`ExternalTextureConverter`]:https://github.com/google/mediapipe/tree/master/mediapipe/java/com/google/mediapipe/components/ExternalTextureConverter.java +[`FrameLayout`]:https://developer.android.com/reference/android/widget/FrameLayout +[`FrameProcessor`]:https://github.com/google/mediapipe/tree/master/mediapipe/java/com/google/mediapipe/components/FrameProcessor.java +[`PermissionHelper`]: https://github.com/google/mediapipe/tree/master/mediapipe/java/com/google/mediapipe/components/PermissionHelper.java 
+[`SurfaceHolder.Callback`]:https://developer.android.com/reference/android/view/SurfaceHolder.Callback.html +[`SurfaceView`]:https://developer.android.com/reference/android/view/SurfaceView +[`SurfaceView`]:https://developer.android.com/reference/android/view/SurfaceView +[`SurfaceTexture`]:https://developer.android.com/reference/android/graphics/SurfaceTexture +[`TextView`]:https://developer.android.com/reference/android/widget/TextView diff --git a/docs/getting_started/hello_world_cpp.md b/docs/getting_started/hello_world_cpp.md new file mode 100644 index 0000000..f0c7ff0 --- /dev/null +++ b/docs/getting_started/hello_world_cpp.md @@ -0,0 +1,137 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/hello_world_cpp +title: Hello World! in C++ +parent: MediaPipe in C++ +grand_parent: Getting Started +nav_order: 1 +--- + +# Hello World! in C++ +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +1. Ensure you have a working version of MediaPipe. See + [installation instructions](./install.md). + +2. To run the [`hello world`] example: + + ```bash + $ git clone https://github.com/google/mediapipe.git + $ cd mediapipe + + $ export GLOG_logtostderr=1 + # Need bazel flag 'MEDIAPIPE_DISABLE_GPU=1' as desktop GPU is not supported currently. + $ bazel run --define MEDIAPIPE_DISABLE_GPU=1 \ + mediapipe/examples/desktop/hello_world:hello_world + + # It should print 10 rows of Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + ``` + +3. 
The [`hello world`] example uses a simple MediaPipe graph in the + `PrintHelloWorld()` function, defined in a [`CalculatorGraphConfig`] proto. + + ```c++ + absl::Status PrintHelloWorld() { + // Configures a simple graph, which concatenates 2 PassThroughCalculators. + CalculatorGraphConfig config = ParseTextProtoOrDie<CalculatorGraphConfig>(R"( + input_stream: "in" + output_stream: "out" + node { + calculator: "PassThroughCalculator" + input_stream: "in" + output_stream: "out1" + } + node { + calculator: "PassThroughCalculator" + input_stream: "out1" + output_stream: "out" + } + )"); + ``` + + You can visualize this graph using + [MediaPipe Visualizer](https://viz.mediapipe.dev) by pasting the + CalculatorGraphConfig content below into the visualizer. See + [here](../tools/visualizer.md) for help on the visualizer. + + ```bash + input_stream: "in" + output_stream: "out" + node { + calculator: "PassThroughCalculator" + input_stream: "in" + output_stream: "out1" + } + node { + calculator: "PassThroughCalculator" + input_stream: "out1" + output_stream: "out" + } + ``` + + This graph consists of 1 graph input stream (`in`) and 1 graph output stream + (`out`), and 2 [`PassThroughCalculator`]s connected serially. + + ![hello_world graph](https://mediapipe.dev/images/hello_world.png) + +4. Before running the graph, an `OutputStreamPoller` object is connected to the + output stream in order to later retrieve the graph output, and a graph run + is started with [`StartRun`]. + + ```c++ + CalculatorGraph graph; + MP_RETURN_IF_ERROR(graph.Initialize(config)); + MP_ASSIGN_OR_RETURN(OutputStreamPoller poller, + graph.AddOutputStreamPoller("out")); + MP_RETURN_IF_ERROR(graph.StartRun({})); + ``` + +5. The example then creates 10 packets (each packet contains a string "Hello + World!" with Timestamp values ranging from 0, 1, ... 9) using the + [`MakePacket`] function, adds each packet into the graph through the `in` + input stream, and finally closes the input stream to finish the graph run.
+ + ```c++ + for (int i = 0; i < 10; ++i) { + MP_RETURN_IF_ERROR(graph.AddPacketToInputStream("in", + MakePacket<std::string>("Hello World!").At(Timestamp(i)))); + } + MP_RETURN_IF_ERROR(graph.CloseInputStream("in")); + ``` + +6. Through the `OutputStreamPoller` object the example then retrieves all 10 + packets from the output stream, gets the string content out of each packet + and prints it to the output log. + + ```c++ + mediapipe::Packet packet; + while (poller.Next(&packet)) { + ABSL_LOG(INFO) << packet.Get<std::string>(); + } + ``` + +[`hello world`]: https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/hello_world/hello_world.cc +[`CalculatorGraphConfig`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator.proto +[`PassThroughCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/core/pass_through_calculator.cc +[`MakePacket`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/packet.h +[`StartRun`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_graph.h diff --git a/docs/getting_started/hello_world_ios.md b/docs/getting_started/hello_world_ios.md new file mode 100644 index 0000000..118b9a0 --- /dev/null +++ b/docs/getting_started/hello_world_ios.md @@ -0,0 +1,599 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/hello_world_ios +title: Hello World! on iOS +parent: MediaPipe on iOS +grand_parent: Getting Started +nav_order: 1 +--- + +# Hello World! on iOS +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Introduction + +This codelab uses MediaPipe on an iOS device.
+ +### What you will learn + +How to develop an iOS application that uses MediaPipe and run a MediaPipe graph +on iOS. + +### What you will build + +A simple camera app for real-time Sobel edge detection applied to a live video +stream on an iOS device. + +![edge_detection_ios_gpu_gif](https://mediapipe.dev/images/mobile/edge_detection_ios_gpu.gif) + +## Setup + +1. Install MediaPipe on your system, see + [MediaPipe installation guide](./install.md) for details. +2. Setup your iOS device for development. +3. Setup [Bazel] on your system to build and deploy the iOS app. + +## Graph for edge detection + +We will be using the following graph, [`edge_detection_mobile_gpu.pbtxt`]: + +``` +# MediaPipe graph that performs GPU Sobel edge detection on a live video stream. +# Used in the examples +# mediapipe/examples/android/src/java/com/google/mediapipe/apps/basic:helloworld +# and mediapipe/examples/ios/helloworld. + +# Images coming into and out of the graph. +input_stream: "input_video" +output_stream: "output_video" + +# Converts RGB images into luminance images, still stored in RGB format. +node: { + calculator: "LuminanceCalculator" + input_stream: "input_video" + output_stream: "luma_video" +} + +# Applies the Sobel filter to luminance images stored in RGB format. +node: { + calculator: "SobelEdgesCalculator" + input_stream: "luma_video" + output_stream: "output_video" +} +``` + +A visualization of the graph is shown below: + +![edge_detection_mobile_gpu](https://mediapipe.dev/images/mobile/edge_detection_mobile_gpu.png) + +This graph has a single input stream named `input_video` for all incoming frames +that will be provided by your device's camera. + +The first node in the graph, `LuminanceCalculator`, takes a single packet (image +frame) and applies a change in luminance using an OpenGL shader. The resulting +image frame is sent to the `luma_video` output stream. 
+ +The second node, `SobelEdgesCalculator`, applies edge detection to incoming +packets in the `luma_video` stream and outputs results in `output_video` output +stream. + +Our iOS application will display the output image frames of the `output_video` +stream. + +## Initial minimal application setup + +We first start with a simple iOS application and demonstrate how to use `bazel` +to build it. + +First, create an Xcode project via File > New > Single View App. + +Set the product name to "HelloWorld", and use an appropriate organization +identifier, such as `com.google.mediapipe`. The organization identifier +along with the product name will be the `bundle_id` for the application, such as +`com.google.mediapipe.HelloWorld`. + +Set the language to Objective-C. + +Save the project to an appropriate location. Let's call this +`$PROJECT_TEMPLATE_LOC`. So your project will be in the +`$PROJECT_TEMPLATE_LOC/HelloWorld` directory. This directory will contain +another directory named `HelloWorld` and a `HelloWorld.xcodeproj` file. + +The `HelloWorld.xcodeproj` will not be useful for this tutorial, as we will use +bazel to build the iOS application. The content of the +`$PROJECT_TEMPLATE_LOC/HelloWorld/HelloWorld` directory is listed below: + +1. `AppDelegate.h` and `AppDelegate.m` +2. `ViewController.h` and `ViewController.m` +3. `main.m` +4. `Info.plist` +5. `Main.storyboard` and `LaunchScreen.storyboard` +6. `Assets.xcassets` directory. + +Note: In newer versions of Xcode, you may see additional files `SceneDelegate.h` +and `SceneDelegate.m`. Make sure to copy them too and add them to the `BUILD` +file mentioned below. + +Copy these files to a directory named `HelloWorld` to a location that can access +the MediaPipe source code. For example, the source code of the application that +we will build in this tutorial is located in +`mediapipe/examples/ios/HelloWorld`. We will refer to this path as the +`$APPLICATION_PATH` throughout the codelab.
+ +Note: MediaPipe provides Objective-C bindings for iOS. The edge detection +application in this tutorial and all iOS examples using MediaPipe use +Objective-C with C++ in `.mm` files. + +Create a `BUILD` file in the `$APPLICATION_PATH` and add the following build +rules: + +``` +MIN_IOS_VERSION = "12.0" + +load( + "@build_bazel_rules_apple//apple:ios.bzl", + "ios_application", +) + +ios_application( + name = "HelloWorldApp", + bundle_id = "com.google.mediapipe.HelloWorld", + families = [ + "iphone", + "ipad", + ], + infoplists = ["Info.plist"], + minimum_os_version = MIN_IOS_VERSION, + provisioning_profile = "//mediapipe/examples/ios:developer_provisioning_profile", + deps = [":HelloWorldAppLibrary"], +) + +objc_library( + name = "HelloWorldAppLibrary", + srcs = [ + "AppDelegate.m", + "ViewController.m", + "main.m", + ], + hdrs = [ + "AppDelegate.h", + "ViewController.h", + ], + data = [ + "Base.lproj/LaunchScreen.storyboard", + "Base.lproj/Main.storyboard", + ], + sdk_frameworks = [ + "UIKit", + ], + deps = [], +) +``` + +The `objc_library` rule adds dependencies for the `AppDelegate` and +`ViewController` classes, `main.m` and the application storyboards. The +templated app depends only on the `UIKit` SDK. + +The `ios_application` rule uses the `HelloWorldAppLibrary` Objective-C library +generated to build an iOS application for installation on your iOS device. + +Note: You need to point to your own iOS developer provisioning profile to be +able to run the application on your iOS device. 
+ +To build the app, use the following command in a terminal: + +``` +bazel build -c opt --config=ios_arm64 <$APPLICATION_PATH>:HelloWorldApp +``` + +For example, to build the `HelloWorldApp` application in +`mediapipe/examples/ios/helloworld`, use the following command: + +``` +bazel build -c opt --config=ios_arm64 mediapipe/examples/ios/helloworld:HelloWorldApp +``` + +Then, go back to Xcode, open Window > Devices and Simulators, select your +device, and add the `.ipa` file generated by the command above to your device. +Here is the document on [setting up and compiling](./ios.md) iOS MediaPipe apps. + +Open the application on your device. Since it is empty, it should display a +blank white screen. + +## Use the camera for the live view feed + +In this tutorial, we will use the `MPPCameraInputSource` class to access and +grab frames from the camera. This class uses the `AVCaptureSession` API to get +the frames from the camera. + +But before using this class, change the `Info.plist` file to support camera +usage in the app. + +In `ViewController.m`, add the following import line: + +``` +#import "mediapipe/objc/MPPCameraInputSource.h" +``` + +Add the following to its implementation block to create an object +`_cameraSource`: + +``` +@implementation ViewController { + // Handles camera access via AVCaptureSession library. + MPPCameraInputSource* _cameraSource; +} +``` + +Add the following code to `viewDidLoad()`: + +``` +-(void)viewDidLoad { + [super viewDidLoad]; + + _cameraSource = [[MPPCameraInputSource alloc] init]; + _cameraSource.sessionPreset = AVCaptureSessionPresetHigh; + _cameraSource.cameraPosition = AVCaptureDevicePositionBack; + // The frame's native format is rotated with respect to the portrait orientation. + _cameraSource.orientation = AVCaptureVideoOrientationPortrait; +} +``` + +The code initializes `_cameraSource`, sets the capture session preset, and selects which +camera to use.
+ +We need to get frames from the `_cameraSource` into our application +`ViewController` to display them. `MPPCameraInputSource` is a subclass of +`MPPInputSource`, which provides a protocol for its delegates, namely the +`MPPInputSourceDelegate`. So our application `ViewController` can be a delegate +of `_cameraSource`. + +Update the interface definition of `ViewController` accordingly: + +``` +@interface ViewController () <MPPInputSourceDelegate> +``` + +To handle camera setup and process incoming frames, we should use a queue +different from the main queue. Add the following to the implementation block of +the `ViewController`: + +``` +// Process camera frames on this queue. +dispatch_queue_t _videoQueue; +``` + +In `viewDidLoad()`, add the following line after initializing the +`_cameraSource` object: + +``` +[_cameraSource setDelegate:self queue:_videoQueue]; +``` + +And add the following code to initialize the queue before setting up the +`_cameraSource` object: + +``` +dispatch_queue_attr_t qosAttribute = dispatch_queue_attr_make_with_qos_class( + DISPATCH_QUEUE_SERIAL, QOS_CLASS_USER_INTERACTIVE, /*relative_priority=*/0); +_videoQueue = dispatch_queue_create(kVideoQueueLabel, qosAttribute); +``` + +We will use a serial queue with the priority `QOS_CLASS_USER_INTERACTIVE` for +processing camera frames. + +Add the following line after the header imports at the top of the file, before +the interface/implementation of the `ViewController`: + +``` +static const char* kVideoQueueLabel = "com.google.mediapipe.example.videoQueue"; +``` + +Before implementing any method from `MPPInputSourceDelegate` protocol, we must +first set up a way to display the camera frames. MediaPipe provides another +utility called `MPPLayerRenderer` to display images on the screen. This utility +can be used to display `CVPixelBufferRef` objects, which is the type of the +images provided by `MPPCameraInputSource` to its delegates.
+ +In `ViewController.m`, add the following import line: + +``` +#import "mediapipe/objc/MPPLayerRenderer.h" +``` + +To display images of the screen, we need to add a new `UIView` object called +`_liveView` to the `ViewController`. + +Add the following lines to the implementation block of the `ViewController`: + +``` +// Display the camera preview frames. +IBOutlet UIView* _liveView; +// Render frames in a layer. +MPPLayerRenderer* _renderer; +``` + +Go to `Main.storyboard`, add a `UIView` object from the object library to the +`View` of the `ViewController` class. Add a referencing outlet from this view to +the `_liveView` object you just added to the `ViewController` class. Resize the +view so that it is centered and covers the entire application screen. + +Go back to `ViewController.m` and add the following code to `viewDidLoad()` to +initialize the `_renderer` object: + +``` +_renderer = [[MPPLayerRenderer alloc] init]; +_renderer.layer.frame = _liveView.layer.bounds; +[_liveView.layer addSublayer:_renderer.layer]; +_renderer.frameScaleMode = MPPFrameScaleModeFillAndCrop; +``` + +To get frames from the camera, we will implement the following method: + +``` +// Must be invoked on _videoQueue. +- (void)processVideoFrame:(CVPixelBufferRef)imageBuffer + timestamp:(CMTime)timestamp + fromSource:(MPPInputSource*)source { + if (source != _cameraSource) { + NSLog(@"Unknown source: %@", source); + return; + } + // Display the captured image on the screen. + CFRetain(imageBuffer); + dispatch_async(dispatch_get_main_queue(), ^{ + [_renderer renderPixelBuffer:imageBuffer]; + CFRelease(imageBuffer); + }); +} +``` + +This is a delegate method of `MPPInputSource`. We first check that we are +getting frames from the right source, i.e. the `_cameraSource`. Then we display +the frame received from the camera via `_renderer` on the main queue. + +Now, we need to start the camera as soon as the view to display the frames is +about to appear. 
To do this, we will implement the +`viewWillAppear:(BOOL)animated` function: + +``` +-(void)viewWillAppear:(BOOL)animated { + [super viewWillAppear:animated]; +} +``` + +Before we start running the camera, we need the user's permission to access it. +`MPPCameraInputSource` provides a function +`requestCameraAccessWithCompletionHandler:(void (^_Nullable)(BOOL +granted))handler` to request camera access and do some work when the user has +responded. Add the following code to `viewWillAppear:animated`: + +``` +[_cameraSource requestCameraAccessWithCompletionHandler:^void(BOOL granted) { + if (granted) { + dispatch_async(_videoQueue, ^{ + [_cameraSource start]; + }); + } +}]; +``` + +Before building the application, add the following dependencies to your `BUILD` +file: + +``` +sdk_frameworks = [ + "AVFoundation", + "CoreGraphics", + "CoreMedia", +], +deps = [ + "//mediapipe/objc:mediapipe_framework_ios", + "//mediapipe/objc:mediapipe_input_sources_ios", + "//mediapipe/objc:mediapipe_layer_renderer", +], +``` + +Now build and run the application on your iOS device. You should see a live +camera view feed after accepting camera permissions. + +We are now ready to use camera frames in a MediaPipe graph. + +## Using a MediaPipe graph in iOS + +### Add relevant dependencies + +We already added the dependencies of the MediaPipe framework code which contains +the iOS API to use a MediaPipe graph. To use a MediaPipe graph, we need to add a +dependency on the graph we intend to use in our application. Add the following +line to the `data` list in your `BUILD` file: + +``` +"//mediapipe/graphs/edge_detection:mobile_gpu_binary_graph", +``` + +Now add the dependency to the calculators used in this graph in the `deps` field +in the `BUILD` file: + +``` +"//mediapipe/graphs/edge_detection:mobile_calculators", +``` + +Finally, rename the file `ViewController.m` to `ViewController.mm` to support +Objective-C++. 
+ +### Use the graph in `ViewController` + +In `ViewController.m`, add the following import line: + +``` +#import "mediapipe/objc/MPPGraph.h" +``` + +Declare a static constant with the name of the graph, the input stream and the +output stream: + +``` +static NSString* const kGraphName = @"mobile_gpu"; + +static const char* kInputStream = "input_video"; +static const char* kOutputStream = "output_video"; +``` + +Add the following property to the interface of the `ViewController`: + +``` +// The MediaPipe graph currently in use. Initialized in viewDidLoad, started in viewWillAppear: and +// sent video frames on _videoQueue. +@property(nonatomic) MPPGraph* mediapipeGraph; +``` + +As explained in the comment above, we will initialize this graph in +`viewDidLoad` first. To do so, we need to load the graph from the `.pbtxt` file +using the following function: + +``` ++ (MPPGraph*)loadGraphFromResource:(NSString*)resource { + // Load the graph config resource. + NSError* configLoadError = nil; + NSBundle* bundle = [NSBundle bundleForClass:[self class]]; + if (!resource || resource.length == 0) { + return nil; + } + NSURL* graphURL = [bundle URLForResource:resource withExtension:@"binarypb"]; + NSData* data = [NSData dataWithContentsOfURL:graphURL options:0 error:&configLoadError]; + if (!data) { + NSLog(@"Failed to load MediaPipe graph config: %@", configLoadError); + return nil; + } + + // Parse the graph config resource into mediapipe::CalculatorGraphConfig proto object. + mediapipe::CalculatorGraphConfig config; + config.ParseFromArray(data.bytes, data.length); + + // Create MediaPipe graph with mediapipe::CalculatorGraphConfig proto object. 
+ MPPGraph* newGraph = [[MPPGraph alloc] initWithGraphConfig:config]; + [newGraph addFrameOutputStream:kOutputStream outputPacketType:MPPPacketTypePixelBuffer]; + return newGraph; +} +``` + +Use this function to initialize the graph in `viewDidLoad` as follows: + +``` +self.mediapipeGraph = [[self class] loadGraphFromResource:kGraphName]; +``` + +The graph should send the results of processing camera frames back to the +`ViewController`. Add the following line after initializing the graph to set the +`ViewController` as a delegate of the `mediapipeGraph` object: + +``` +self.mediapipeGraph.delegate = self; +``` + +To avoid memory contention while processing frames from the live video feed, add +the following line: + +``` +// Set maxFramesInFlight to a small value to avoid memory contention for real-time processing. +self.mediapipeGraph.maxFramesInFlight = 2; +``` + +Now, start the graph when the user has granted the permission to use the camera +in our app: + +``` +[_cameraSource requestCameraAccessWithCompletionHandler:^void(BOOL granted) { + if (granted) { + // Start running self.mediapipeGraph. + NSError* error; + if (![self.mediapipeGraph startWithError:&error]) { + NSLog(@"Failed to start graph: %@", error); + } + else if (![self.mediapipeGraph waitUntilIdleWithError:&error]) { + NSLog(@"Failed to complete graph initial run: %@", error); + } + + dispatch_async(_videoQueue, ^{ + [_cameraSource start]; + }); + } +}]; +``` + +Note: It is important to start the graph before starting the camera and wait +until completion, so that the graph is ready to process frames as soon as the +camera starts sending them. + +Earlier, when we received frames from the camera in the `processVideoFrame` +function, we displayed them in the `_liveView` using the `_renderer`. Now, we +need to send those frames to the graph and render the results instead. 
Modify +this function's implementation to do the following: + +``` +- (void)processVideoFrame:(CVPixelBufferRef)imageBuffer + timestamp:(CMTime)timestamp + fromSource:(MPPInputSource*)source { + if (source != _cameraSource) { + NSLog(@"Unknown source: %@", source); + return; + } + [self.mediapipeGraph sendPixelBuffer:imageBuffer + intoStream:kInputStream + packetType:MPPPacketTypePixelBuffer]; +} +``` + +We send the `imageBuffer` to `self.mediapipeGraph` as a packet of type +`MPPPacketTypePixelBuffer` into the input stream `kInputStream`, i.e. +"input_video". + +The graph will run with this input packet and output a result in +`kOutputStream`, i.e. "output_video". We can implement the following delegate +method to receive packets on this output stream and display them on the screen: + +``` +- (void)mediapipeGraph:(MPPGraph*)graph + didOutputPixelBuffer:(CVPixelBufferRef)pixelBuffer + fromStream:(const std::string&)streamName { + if (streamName == kOutputStream) { + // Display the captured image on the screen. + CVPixelBufferRetain(pixelBuffer); + dispatch_async(dispatch_get_main_queue(), ^{ + [_renderer renderPixelBuffer:pixelBuffer]; + CVPixelBufferRelease(pixelBuffer); + }); + } +} +``` + +Update the interface definition of `ViewController` with `MPPGraphDelegate`: + +``` +@interface ViewController () <MPPGraphDelegate, MPPInputSourceDelegate> +``` + +And that is all! Build and run the app on your iOS device. You should see the +results of running the edge detection graph on a live video feed. Congrats! + +![edge_detection_ios_gpu_gif](https://mediapipe.dev/images/mobile/edge_detection_ios_gpu.gif) + +Please note that the iOS examples now use a [common] template app. The code in +this tutorial is used in the [common] template app. The [helloworld] app has the +appropriate `BUILD` file dependencies for the edge detection graph.
+ +[Bazel]:https://bazel.build/ +[`edge_detection_mobile_gpu.pbtxt`]:https://github.com/google/mediapipe/tree/master/mediapipe/graphs/edge_detection/edge_detection_mobile_gpu.pbtxt +[common]:https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/common +[helloworld]:https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/helloworld diff --git a/docs/getting_started/help.md b/docs/getting_started/help.md new file mode 100644 index 0000000..2cb6b9e --- /dev/null +++ b/docs/getting_started/help.md @@ -0,0 +1,55 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/help +title: Getting Help +parent: Getting Started +nav_order: 8 +--- + +# Getting Help +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Technical questions + +For help with technical or algorithmic questions, visit +[Stack Overflow](https://stackoverflow.com/questions/tagged/mediapipe) to find +answers and support from the MediaPipe community. + +## Bugs and feature requests + +To report bugs or make feature requests, +[file an issue on GitHub](https://github.com/google/mediapipe/issues). + +If you open a GitHub issue, here is our policy: + +1. It must be a bug, a feature request, or a significant problem with documentation (for small doc fixes please send a PR instead). +2. The form below must be filled out. + +**Here's why we have that policy**: MediaPipe developers respond to issues. We want to focus on work that benefits the whole community, e.g., fixing bugs and adding features. Support only helps individuals. GitHub also notifies thousands of people when issues are filed. We want them to see you communicating an interesting problem, rather than being redirected to Stack Overflow. 
+ +------------------------ + +### System information +- **Have I written custom code**: +- **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**: +- **Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device**: +- **Bazel version**: +- **Android Studio, NDK, SDK versions (if issue is related to building in mobile dev environment)**: +- **Xcode & Tulsi version (if issue is related to building in mobile dev environment)**: +- **Exact steps to reproduce**: + +### Describe the problem +Describe the problem clearly here. Be sure to convey here why it's a bug in MediaPipe or a feature request. + +### Source code / logs +Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached instead of being pasted into the issue as text. diff --git a/docs/getting_started/install.md b/docs/getting_started/install.md new file mode 100644 index 0000000..b302847 --- /dev/null +++ b/docs/getting_started/install.md @@ -0,0 +1,874 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/install +title: Installation +parent: Getting Started +nav_order: 6 +--- + +# Installation +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +Note: To interoperate with OpenCV, OpenCV 3.x to 4.1 are preferred. OpenCV +2.x currently works but interoperability support may be deprecated in the +future. + +Note: If you plan to use TensorFlow calculators and example apps, there is a +known issue with gcc and g++ version 6.3 and 7.3. Please use other versions. 
+ +Note: To make MediaPipe work with TensorFlow, please set Python 3.7 as the +default Python version and install the Python "six" library by running `pip3 +install --user six`. + +## Installing on Debian and Ubuntu + +1. Install Bazelisk. + + Follow the official + [Bazel documentation](https://docs.bazel.build/versions/master/install-bazelisk.html) + to install Bazelisk. + +2. Check out the MediaPipe repository. + + ```bash + $ cd $HOME + $ git clone --depth 1 https://github.com/google/mediapipe.git + + # Change directory into MediaPipe root directory + $ cd mediapipe + ``` + +3. Install OpenCV and FFmpeg. + + **Option 1**. Use a package manager tool to install the pre-compiled OpenCV + libraries. FFmpeg will be installed via `libopencv-video-dev`. + + OS | OpenCV + -------------------- | ------ + Debian 9 (stretch) | 2.4 + Debian 10 (buster) | 3.2 + Debian 11 (bullseye) | 4.5 + Ubuntu 16.04 LTS | 2.4 + Ubuntu 18.04 LTS | 3.2 + Ubuntu 20.04 LTS | 4.2 + Ubuntu 20.04 LTS | 4.2 + Ubuntu 21.04 | 4.5 + + ```bash + $ sudo apt-get install -y \ + libopencv-core-dev \ + libopencv-highgui-dev \ + libopencv-calib3d-dev \ + libopencv-features2d-dev \ + libopencv-imgproc-dev \ + libopencv-video-dev + ``` + + **Note**. On Debian 11/Ubuntu 21.04 where OpenCV 4.5 is installed with + `libopencv-video-dev`, `libopencv-contrib-dev` should also be installed.
+ + ```bash + $ sudo apt-get install -y libopencv-contrib-dev + ``` + + MediaPipe's [`opencv_linux.BUILD`] and [`WORKSPACE`] are already configured + for OpenCV 2/3 and should work correctly on any architecture: + + ```bash + # WORKSPACE + new_local_repository( + name = "linux_opencv", + build_file = "@//third_party:opencv_linux.BUILD", + path = "/usr", + ) + + # opencv_linux.BUILD for OpenCV 2/3 installed from Debian package + cc_library( + name = "opencv", + linkopts = [ + "-l:libopencv_core.so", + "-l:libopencv_calib3d.so", + "-l:libopencv_features2d.so", + "-l:libopencv_highgui.so", + "-l:libopencv_imgcodecs.so", + "-l:libopencv_imgproc.so", + "-l:libopencv_video.so", + "-l:libopencv_videoio.so", + ], + ) + ``` + + For OpenCV 4 you need to modify [`opencv_linux.BUILD`] taking into account + current architecture: + + ```bash + # WORKSPACE + new_local_repository( + name = "linux_opencv", + build_file = "@//third_party:opencv_linux.BUILD", + path = "/usr", + ) + + # opencv_linux.BUILD for OpenCV 4 installed from Debian package + cc_library( + name = "opencv", + hdrs = glob([ + # Uncomment according to your multiarch value (gcc -print-multiarch): + # "include/aarch64-linux-gnu/opencv4/opencv2/cvconfig.h", + # "include/arm-linux-gnueabihf/opencv4/opencv2/cvconfig.h", + # "include/x86_64-linux-gnu/opencv4/opencv2/cvconfig.h", + "include/opencv4/opencv2/**/*.h*", + ]), + includes = [ + # Uncomment according to your multiarch value (gcc -print-multiarch): + # "include/aarch64-linux-gnu/opencv4/", + # "include/arm-linux-gnueabihf/opencv4/", + # "include/x86_64-linux-gnu/opencv4/", + "include/opencv4/", + ], + linkopts = [ + "-l:libopencv_core.so", + "-l:libopencv_calib3d.so", + "-l:libopencv_features2d.so", + "-l:libopencv_highgui.so", + "-l:libopencv_imgcodecs.so", + "-l:libopencv_imgproc.so", + "-l:libopencv_video.so", + "-l:libopencv_videoio.so", + ], + ) + ``` + + **Option 2**. 
Run [`setup_opencv.sh`] to automatically build OpenCV from + source and modify MediaPipe's OpenCV config. This option will do all steps + defined in Option 3 automatically. + + **Option 3**. Follow OpenCV's + [documentation](https://docs.opencv.org/3.4.6/d7/d9f/tutorial_linux_install.html) + to manually build OpenCV from source code. + + You may need to modify [`WORKSPACE`] and [`opencv_linux.BUILD`] to point + MediaPipe to your own OpenCV libraries. Assume OpenCV would be installed to + `/usr/local/` which is recommended by default. + + OpenCV 2/3 setup: + + ```bash + # WORKSPACE + new_local_repository( + name = "linux_opencv", + build_file = "@//third_party:opencv_linux.BUILD", + path = "/usr/local", + ) + + # opencv_linux.BUILD for OpenCV 2/3 installed to /usr/local + cc_library( + name = "opencv", + linkopts = [ + "-L/usr/local/lib", + "-l:libopencv_core.so", + "-l:libopencv_calib3d.so", + "-l:libopencv_features2d.so", + "-l:libopencv_highgui.so", + "-l:libopencv_imgcodecs.so", + "-l:libopencv_imgproc.so", + "-l:libopencv_video.so", + "-l:libopencv_videoio.so", + ], + ) + ``` + + OpenCV 4 setup: + + ```bash + # WORKSPACE + new_local_repository( + name = "linux_opencv", + build_file = "@//third_party:opencv_linux.BUILD", + path = "/usr/local", + ) + + # opencv_linux.BUILD for OpenCV 4 installed to /usr/local + cc_library( + name = "opencv", + hdrs = glob([ + "include/opencv4/opencv2/**/*.h*", + ]), + includes = [ + "include/opencv4/", + ], + linkopts = [ + "-L/usr/local/lib", + "-l:libopencv_core.so", + "-l:libopencv_calib3d.so", + "-l:libopencv_features2d.so", + "-l:libopencv_highgui.so", + "-l:libopencv_imgcodecs.so", + "-l:libopencv_imgproc.so", + "-l:libopencv_video.so", + "-l:libopencv_videoio.so", + ], + ) + ``` + + Current FFmpeg setup is defined in [`ffmpeg_linux.BUILD`] and should work + for any architecture: + + ```bash + # WORKSPACE + new_local_repository( + name = "linux_ffmpeg", + build_file = "@//third_party:ffmpeg_linux.BUILD", + path = "/usr" + ) 
+ + # ffmpeg_linux.BUILD for FFmpeg installed from Debian package + cc_library( + name = "libffmpeg", + linkopts = [ + "-l:libavcodec.so", + "-l:libavformat.so", + "-l:libavutil.so", + ], + ) + ``` + +4. For running desktop examples on Linux only (not on OS X) with GPU + acceleration. + + ```bash + # Requires a GPU with EGL driver support. + # Can use mesa GPU libraries for desktop, (or Nvidia/AMD equivalent). + sudo apt-get install mesa-common-dev libegl1-mesa-dev libgles2-mesa-dev + + # To compile with GPU support, replace + --define MEDIAPIPE_DISABLE_GPU=1 + # with + --copt -DMESA_EGL_NO_X11_HEADERS --copt -DEGL_NO_X11 + # when building GPU examples. + ``` + +5. Run the [Hello World! in C++ example](./hello_world_cpp.md). + + ```bash + $ export GLOG_logtostderr=1 + + # if you are running on Linux desktop with CPU only + $ bazel run --define MEDIAPIPE_DISABLE_GPU=1 \ + mediapipe/examples/desktop/hello_world:hello_world + + # If you are running on Linux desktop with GPU support enabled (via mesa drivers) + $ bazel run --copt -DMESA_EGL_NO_X11_HEADERS --copt -DEGL_NO_X11 \ + mediapipe/examples/desktop/hello_world:hello_world + + # Should print: + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + ``` + +If you run into a build error, please read +[Troubleshooting](./troubleshooting.md) to find the solutions of several common +build issues. + +## Installing on CentOS + +**Disclaimer**: Running MediaPipe on CentOS is experimental. + +1. Install Bazelisk. + + Follow the official + [Bazel documentation](https://docs.bazel.build/versions/master/install-bazelisk.html) + to install Bazelisk. + +2. Checkout MediaPipe repository. + + ```bash + $ git clone --depth 1 https://github.com/google/mediapipe.git + + # Change directory into MediaPipe root directory + $ cd mediapipe + ``` + +3. Install OpenCV. + + Option 1. 
Use package manager tool to install the pre-compiled version. + + Note: yum installs OpenCV 2.4.5, which may have an opencv/gstreamer + [issue](https://github.com/opencv/opencv/issues/4592). + + ```bash + $ sudo yum install opencv-devel + ``` + + Option 2. Build OpenCV from source code. + + Note: You may need to modify [`WORKSPACE`], [`opencv_linux.BUILD`] and + [`ffmpeg_linux.BUILD`] to point MediaPipe to your own OpenCV and FFmpeg + libraries. For example if OpenCV and FFmpeg are both manually installed in + "/usr/local/", you will need to update: (1) the "linux_opencv" and + "linux_ffmpeg" new_local_repository rules in [`WORKSPACE`], (2) the "opencv" + cc_library rule in [`opencv_linux.BUILD`], and (3) the "libffmpeg" + cc_library rule in [`ffmpeg_linux.BUILD`]. These 3 changes are shown below: + + ```bash + new_local_repository( + name = "linux_opencv", + build_file = "@//third_party:opencv_linux.BUILD", + path = "/usr/local", + ) + + new_local_repository( + name = "linux_ffmpeg", + build_file = "@//third_party:ffmpeg_linux.BUILD", + path = "/usr/local", + ) + + cc_library( + name = "opencv", + srcs = glob( + [ + "lib/libopencv_core.so", + "lib/libopencv_highgui.so", + "lib/libopencv_imgcodecs.so", + "lib/libopencv_imgproc.so", + "lib/libopencv_video.so", + "lib/libopencv_videoio.so", + ], + ), + hdrs = glob([ + # For OpenCV 3.x + "include/opencv2/**/*.h*", + # For OpenCV 4.x + # "include/opencv4/opencv2/**/*.h*", + ]), + includes = [ + # For OpenCV 3.x + "include/", + # For OpenCV 4.x + # "include/opencv4/", + ], + linkstatic = 1, + visibility = ["//visibility:public"], + ) + + cc_library( + name = "libffmpeg", + srcs = glob( + [ + "lib/libav*.so", + ], + ), + hdrs = glob(["include/libav*/*.h"]), + includes = ["include"], + linkopts = [ + "-lavcodec", + "-lavformat", + "-lavutil", + ], + linkstatic = 1, + visibility = ["//visibility:public"], + ) + ``` + +4. Run the [Hello World! in C++ example](./hello_world_cpp.md). 
+ + ```bash + $ export GLOG_logtostderr=1 + # Need bazel flag 'MEDIAPIPE_DISABLE_GPU=1' if you are running on Linux desktop with CPU only + $ bazel run --define MEDIAPIPE_DISABLE_GPU=1 \ + mediapipe/examples/desktop/hello_world:hello_world + + # Should print: + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + ``` + +If you run into a build error, please read +[Troubleshooting](./troubleshooting.md) to find the solutions of several common +build issues. + +## Installing on macOS + +1. Prework: + + * Install [Homebrew](https://brew.sh). + * Install [Xcode](https://developer.apple.com/xcode/) and its Command Line + Tools by `xcode-select --install`. + +2. Install Bazelisk. + + Follow the official + [Bazel documentation](https://docs.bazel.build/versions/master/install-bazelisk.html) + to install Bazelisk. + +3. Checkout MediaPipe repository. + + ```bash + $ git clone --depth 1 https://github.com/google/mediapipe.git + + $ cd mediapipe + ``` + +4. Install OpenCV and FFmpeg. + + Option 1. Use HomeBrew package manager tool to install the pre-compiled + OpenCV 3 libraries. FFmpeg will be installed via OpenCV. + + ```bash + $ brew install opencv@3 + + # There is a known issue caused by the glog dependency. Uninstall glog. + $ brew uninstall --ignore-dependencies glog + ``` + + Option 2. Use MacPorts package manager tool to install the OpenCV libraries. 
+ + ```bash + $ port install opencv + ``` + + Note: when using MacPorts, please edit the [`WORKSPACE`], + [`opencv_macos.BUILD`], and [`ffmpeg_macos.BUILD`] files like the following: + + ```bash + new_local_repository( + name = "macos_opencv", + build_file = "@//third_party:opencv_macos.BUILD", + path = "/opt", + ) + + new_local_repository( + name = "macos_ffmpeg", + build_file = "@//third_party:ffmpeg_macos.BUILD", + path = "/opt", + ) + + cc_library( + name = "opencv", + srcs = glob( + [ + "local/lib/libopencv_core.dylib", + "local/lib/libopencv_highgui.dylib", + "local/lib/libopencv_imgcodecs.dylib", + "local/lib/libopencv_imgproc.dylib", + "local/lib/libopencv_video.dylib", + "local/lib/libopencv_videoio.dylib", + ], + ), + hdrs = glob(["local/include/opencv2/**/*.h*"]), + includes = ["local/include/"], + linkstatic = 1, + visibility = ["//visibility:public"], + ) + + cc_library( + name = "libffmpeg", + srcs = glob( + [ + "local/lib/libav*.dylib", + ], + ), + hdrs = glob(["local/include/libav*/*.h"]), + includes = ["local/include/"], + linkopts = [ + "-lavcodec", + "-lavformat", + "-lavutil", + ], + linkstatic = 1, + visibility = ["//visibility:public"], + ) + ``` + +5. Make sure that Python 3 and the Python "six" library are installed. + + ``` + $ brew install python + $ sudo ln -s -f /usr/local/bin/python3.7 /usr/local/bin/python + $ python --version + Python 3.7.4 + $ pip3 install --user six + ``` + +6. Run the [Hello World! in C++ example](./hello_world_cpp.md). + + ```bash + $ export GLOG_logtostderr=1 + # Need bazel flag 'MEDIAPIPE_DISABLE_GPU=1' as desktop GPU is currently not supported + $ bazel run --define MEDIAPIPE_DISABLE_GPU=1 \ + mediapipe/examples/desktop/hello_world:hello_world + + # Should print: + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! 
+ ``` + +If you run into a build error, please read +[Troubleshooting](./troubleshooting.md) to find the solutions of several common +build issues. + +## Installing on Windows + +**Disclaimer**: Running MediaPipe on Windows is experimental. + +Note: building MediaPipe Android apps is still not possible on native +Windows. Please do this in WSL instead and see the WSL setup instruction in the +next section. + +1. Install [MSYS2](https://www.msys2.org/) and edit the `%PATH%` environment + variable. + + If MSYS2 is installed to `C:\msys64`, add `C:\msys64\usr\bin` to your + `%PATH%` environment variable. + +2. Install necessary packages. + + ``` + C:\> pacman -S git patch unzip + ``` + +3. Install Python and allow the executable to edit the `%PATH%` environment + variable. + + Download Python Windows executable from + https://www.python.org/downloads/windows/ and install. + +4. Install Visual C++ Build Tools 2019 and WinSDK. + + Go to + [the Visual Studio website](https://visualstudio.microsoft.com/visual-cpp-build-tools), + download build tools, and install Microsoft Visual C++ 2019 Redistributable + and Microsoft Build Tools 2019. + + Download the WinSDK from + [the official Microsoft website](https://developer.microsoft.com/en-us/windows/downloads/windows-10-sdk/) + and install. + +5. Install Bazel or Bazelisk and add the location of the Bazel executable to + the `%PATH%` environment variable. + + Option 1. Follow + [the official Bazel documentation](https://docs.bazel.build/versions/master/install-windows.html) + to install Bazel 6.1.1 or higher. + + Option 2. Follow the official + [Bazel documentation](https://docs.bazel.build/versions/master/install-bazelisk.html) + to install Bazelisk. + +6. Set Bazel variables. Learn more details about + ["Build on Windows"](https://docs.bazel.build/versions/master/windows.html#build-c-with-msvc) + in the Bazel official documentation. + + ``` + # Please find the exact paths and version numbers from your local version.
+ C:\> set BAZEL_VS=C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools + C:\> set BAZEL_VC=C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC + C:\> set BAZEL_VC_FULL_VERSION=<Your local VC version> + C:\> set BAZEL_WINSDK_FULL_VERSION=<Your local WinSDK version> + ``` + +7. Checkout MediaPipe repository. + + ``` + C:\Users\Username\mediapipe_repo> git clone --depth 1 https://github.com/google/mediapipe.git + + # Change directory into MediaPipe root directory + C:\Users\Username\mediapipe_repo> cd mediapipe + ``` + +8. Install OpenCV. + + Download the Windows executable from https://opencv.org/releases/ and + install. We currently use OpenCV 3.4.10. Remember to edit the [`WORKSPACE`] + file if OpenCV is not installed at `C:\opencv`. + + ``` + new_local_repository( + name = "windows_opencv", + build_file = "@//third_party:opencv_windows.BUILD", + path = "C:\\<path to opencv>\\build", + ) + ``` + +9. Run the [Hello World! in C++ example](./hello_world_cpp.md). + + Note: For building MediaPipe on Windows, please add `--action_env + PYTHON_BIN_PATH="C://path//to//python.exe"` to the build command. + Alternatively, you can follow + [issue 724](https://github.com/google/mediapipe/issues/724) to fix the + python configuration manually. + + ``` + C:\Users\Username\mediapipe_repo>bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 --action_env PYTHON_BIN_PATH="C://python_36//python.exe" mediapipe/examples/desktop/hello_world + + C:\Users\Username\mediapipe_repo>set GLOG_logtostderr=1 + + C:\Users\Username\mediapipe_repo>bazel-bin\mediapipe\examples\desktop\hello_world\hello_world.exe + + # should print: + # I20200514 20:43:12.277598 1200 hello_world.cc:56] Hello World! + # I20200514 20:43:12.278597 1200 hello_world.cc:56] Hello World! + # I20200514 20:43:12.279618 1200 hello_world.cc:56] Hello World! + # I20200514 20:43:12.279618 1200 hello_world.cc:56] Hello World! + # I20200514 20:43:12.279618 1200 hello_world.cc:56] Hello World! + # I20200514 20:43:12.279618 1200 hello_world.cc:56] Hello World!
+ # I20200514 20:43:12.279618 1200 hello_world.cc:56] Hello World! + # I20200514 20:43:12.279618 1200 hello_world.cc:56] Hello World! + # I20200514 20:43:12.279618 1200 hello_world.cc:56] Hello World! + # I20200514 20:43:12.280613 1200 hello_world.cc:56] Hello World! + ``` + +If you run into a build error, please read +[Troubleshooting](./troubleshooting.md) to find the solutions of several common +build issues. + +## Installing on Windows Subsystem for Linux (WSL) + +Note: The pre-built OpenCV packages don't support cameras in WSL. Unless you +[compile](https://funvision.blogspot.com/2019/12/opencv-web-camera-and-video-streams-in.html) +OpenCV with FFMPEG and GStreamer in WSL, the live demos won't work with any +cameras. Alternatively, you can use a video file as input. + +1. Follow the + [instructions](https://docs.microsoft.com/en-us/windows/wsl/install-win10) to + install Windows Subsystem for Linux (Ubuntu). + +2. Install Windows ADB and start the ADB server in Windows. + + Note: The Windows and WSL ADB versions must be the same, e.g., if WSL + has ADB 1.0.39, you need to download the corresponding Windows ADB from + [here](https://dl.google.com/android/repository/platform-tools_r30.0.3-windows.zip). + +3. Launch WSL. + + Note: All the following steps will be executed in WSL. The Windows directory + of the Linux Subsystem can be found in + C:\Users\YourUsername\AppData\Local\Packages\CanonicalGroupLimited.UbuntuonWindows_SomeID\LocalState\rootfs\home + +4. Install the needed packages. + + ```bash + username@DESKTOP-TMVLBJ1:~$ sudo apt-get update && sudo apt-get install -y build-essential git python zip adb openjdk-8-jdk + ``` + +5. Install Bazelisk. + + Follow the official + [Bazel documentation](https://docs.bazel.build/versions/master/install-bazelisk.html) + to install Bazelisk. + +6. Checkout MediaPipe repository.
+ + ```bash + username@DESKTOP-TMVLBJ1:~$ git clone --depth 1 https://github.com/google/mediapipe.git + + username@DESKTOP-TMVLBJ1:~$ cd mediapipe + ``` + +7. Install OpenCV and FFmpeg. + + Option 1. Use package manager tool to install the pre-compiled OpenCV + libraries. FFmpeg will be installed via libopencv-video-dev. + + ```bash + username@DESKTOP-TMVLBJ1:~/mediapipe$ sudo apt-get install libopencv-core-dev libopencv-highgui-dev \ + libopencv-calib3d-dev libopencv-features2d-dev \ + libopencv-imgproc-dev libopencv-video-dev + ``` + + Option 2. Run [`setup_opencv.sh`] to automatically build OpenCV from source + and modify MediaPipe's OpenCV config. + + Option 3. Follow OpenCV's + [documentation](https://docs.opencv.org/3.4.6/d7/d9f/tutorial_linux_install.html) + to manually build OpenCV from source code. + + Note: You may need to modify [`WORKSPACE`] and [`opencv_linux.BUILD`] to + point MediaPipe to your own OpenCV libraries, e.g., if OpenCV 4 is installed + in "/usr/local/", you need to update the "linux_opencv" new_local_repository + rule in [`WORKSPACE`] and "opencv" cc_library rule in [`opencv_linux.BUILD`] + like the following: + + ```bash + new_local_repository( + name = "linux_opencv", + build_file = "@//third_party:opencv_linux.BUILD", + path = "/usr/local", + ) + + cc_library( + name = "opencv", + srcs = glob( + [ + "lib/libopencv_core.so", + "lib/libopencv_highgui.so", + "lib/libopencv_imgcodecs.so", + "lib/libopencv_imgproc.so", + "lib/libopencv_video.so", + "lib/libopencv_videoio.so", + ], + ), + hdrs = glob(["include/opencv4/**/*.h*"]), + includes = ["include/opencv4/"], + linkstatic = 1, + visibility = ["//visibility:public"], + ) + ``` + +8. Run the [Hello World! in C++ example](./hello_world_cpp.md). 
+ + ```bash + username@DESKTOP-TMVLBJ1:~/mediapipe$ export GLOG_logtostderr=1 + + # Need bazel flag 'MEDIAPIPE_DISABLE_GPU=1' as desktop GPU is currently not supported + username@DESKTOP-TMVLBJ1:~/mediapipe$ bazel run --define MEDIAPIPE_DISABLE_GPU=1 \ + mediapipe/examples/desktop/hello_world:hello_world + + # Should print: + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + ``` + +If you run into a build error, please +read [Troubleshooting](./troubleshooting.md) to find the solutions of several +common build issues. + +## Installing using Docker + +This will use a Docker image that will isolate mediapipe's installation from the rest of the system. + +1. [Install Docker](https://docs.docker.com/install/#supported-platforms) on + your host system. + +2. Build a docker image with tag "mediapipe". + + ```bash + $ git clone --depth 1 https://github.com/google/mediapipe.git + $ cd mediapipe + $ docker build --tag=mediapipe . + + # Should print: + # Sending build context to Docker daemon 147.8MB + # Step 1/9 : FROM ubuntu:latest + # latest: Pulling from library/ubuntu + # 6abc03819f3e: Pull complete + # 05731e63f211: Pull complete + # ........ + # See http://bazel.build/docs/getting-started.html to start a new project! + # Removing intermediate container 82901b5e79fa + # ---> f5d5f402071b + # Step 9/9 : COPY . /mediapipe/ + # ---> a95c212089c5 + # Successfully built a95c212089c5 + # Successfully tagged mediapipe:latest + ``` + +3. Run the [Hello World! in C++ example](./hello_world_cpp.md). + + ```bash + $ docker run -it --name mediapipe mediapipe:latest + + root@bca08b91ff63:/mediapipe# GLOG_logtostderr=1 bazel run --define MEDIAPIPE_DISABLE_GPU=1 mediapipe/examples/desktop/hello_world + + # Should print: + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! + # Hello World! 
+ # Hello World! + # Hello World! + ``` + +If you run into a build error, please +read [Troubleshooting](./troubleshooting.md) to find the solutions of several +common build issues. + +4. Build a MediaPipe Android example. + + ```bash + $ docker run -it --name mediapipe mediapipe:latest + + root@bca08b91ff63:/mediapipe# bash ./setup_android_sdk_and_ndk.sh + + # Should print: + # Android NDK is now installed. Consider setting $ANDROID_NDK_HOME environment variable to be /root/Android/Sdk/ndk-bundle/android-ndk-r19c + # Set android_ndk_repository and android_sdk_repository in WORKSPACE + # Done + + root@bca08b91ff63:/mediapipe# bazel build -c opt --config=android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetectiongpu:objectdetectiongpu + + # Should print: + # Target //mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetectiongpu:objectdetectiongpu up-to-date: + # bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetectiongpu/objectdetectiongpu_deploy.jar + # bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetectiongpu/objectdetectiongpu_unsigned.apk + # bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objectdetectiongpu/objectdetectiongpu.apk + # INFO: Elapsed time: 144.462s, Critical Path: 79.47s + # INFO: 1958 processes: 1 local, 1863 processwrapper-sandbox, 94 worker. 
+ # INFO: Build completed successfully, 2028 total actions + ``` + + + +[`WORKSPACE`]: https://github.com/google/mediapipe/blob/master/WORKSPACE +[`opencv_linux.BUILD`]: https://github.com/google/mediapipe/tree/master/third_party/opencv_linux.BUILD +[`ffmpeg_linux.BUILD`]: https://github.com/google/mediapipe/tree/master/third_party/ffmpeg_linux.BUILD +[`opencv_macos.BUILD`]: https://github.com/google/mediapipe/tree/master/third_party/opencv_macos.BUILD +[`ffmpeg_macos.BUILD`]: https://github.com/google/mediapipe/tree/master/third_party/ffmpeg_macos.BUILD +[`setup_opencv.sh`]: https://github.com/google/mediapipe/blob/master/setup_opencv.sh diff --git a/docs/getting_started/ios.md b/docs/getting_started/ios.md new file mode 100644 index 0000000..ad529b0 --- /dev/null +++ b/docs/getting_started/ios.md @@ -0,0 +1,241 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/ios +title: MediaPipe on iOS +parent: Getting Started +has_children: true +has_toc: false +nav_order: 2 +--- + +# MediaPipe on iOS +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +Please follow the instructions below to build iOS example apps in the supported +MediaPipe [solutions](../solutions/solutions.md). To learn more about these +example apps, start from +[Hello World! on iOS](./hello_world_ios.md). + +## Building iOS example apps + +### Prerequisite + +1. Install MediaPipe following these [instructions](./install.md). + +2. Install [Xcode](https://developer.apple.com/xcode/), then install the + Command Line Tools using: + + ```bash + xcode-select --install + ``` + +3. Install [Bazelisk](https://github.com/bazelbuild/bazelisk). + + + We recommend using [Homebrew](https://brew.sh/) to get the latest versions.
+ + ```bash + brew install bazelisk + ``` + +4. Set Python 3.7 as the default Python version and install the Python "six" + library. This is needed for TensorFlow. + + ```bash + pip3 install --user six + ``` + +5. Clone the MediaPipe repository. + + ```bash + git clone https://github.com/google/mediapipe.git + ``` + +### Set up a bundle ID prefix + +All iOS apps must have a bundle ID, and you must have a provisioning profile +that lets you install an app with that ID onto your phone. To avoid clashes +between different MediaPipe users, you need to configure a unique prefix for the +bundle IDs of our iOS demo apps. + +If you have a custom provisioning profile, see +[Custom provisioning](#custom-provisioning) below. + +Otherwise, run this command to generate a unique prefix: + +```bash +python3 mediapipe/examples/ios/link_local_profiles.py +``` + +### Create an Xcode project + +This allows you to edit and debug one of the example apps in Xcode. It also +allows you to make use of automatic provisioning (see later section). + +1. We will use a tool called [Tulsi](https://tulsi.bazel.build/) for generating + Xcode projects from Bazel build configurations. + + ```bash + # cd out of the mediapipe directory, then: + git clone https://github.com/bazelbuild/tulsi.git + cd tulsi + # remove Xcode version from Tulsi's .bazelrc (see http://github.com/bazelbuild/tulsi#building-and-installing): + sed -i .orig '/xcode_version/d' .bazelrc + # build and run Tulsi: + sh build_and_run.sh + ``` + + This will install `Tulsi.app` inside the `Applications` directory in your + home directory. + + **Note**: Please ensure the `xcode_version` in the + [`build_and_run.sh`](https://github.com/bazelbuild/tulsi/blob/b1d0108e6a93dbe8ab01529b2c607b6b651f0759/build_and_run.sh#L26) + file in tulsi repo is the same version as installed in your system. + +2. Open `mediapipe/Mediapipe.tulsiproj` using the Tulsi app. 
+ + Tip: If Tulsi displays an error saying "Bazel could not be found", press the + "Bazel..." button in the Packages tab and select the `bazel` executable in + your Homebrew `/bin/` directory. + +3. Select the MediaPipe config in the Configs tab, then press the Generate + button below. You will be asked for a location to save the Xcode project. + Once the project is generated, it will be opened in Xcode. + + If you get an error about bundle IDs, see the + [previous section](#set-up-a-bundle-id-prefix). + +### Set up provisioning + +To install applications on an iOS device, you need a provisioning profile. There +are two options: + +1. Automatic provisioning. This allows you to build and install an app on your + personal device. The provisioning profile is managed by Xcode, and has to be + updated often (it is valid for about a week). + +2. Custom provisioning. This uses a provisioning profile associated with an + Apple developer account. These profiles have a longer validity period and + can target multiple devices, but you need a paid developer account with + Apple to obtain one. + +#### Automatic provisioning + +1. Create an Xcode project for MediaPipe, as discussed + [earlier](#create-an-xcode-project). + +2. In the project navigator in the left sidebar, select the "Mediapipe" + project. + +3. Select one of the application targets, e.g. HandTrackingGpuApp. + +4. Select the "Signing & Capabilities" tab. + +5. Check "Automatically manage signing", and confirm the dialog box. + +6. Select "_Your Name_ (Personal Team)" in the Team pop-up menu. + +7. This set-up needs to be done once for each application you want to install. + Repeat steps 3-6 as needed. + +This generates provisioning profiles for each app you have selected. Now we need +to tell Bazel to use them. We have provided a script to make this easier. + +1. In the terminal, go to the `mediapipe` directory where you cloned the + repository. + +2.
Run this command: + + ```bash + python3 mediapipe/examples/ios/link_local_profiles.py + ``` + +This will find and link the provisioning profile for all applications for which +you have enabled automatic provisioning in Xcode. + +Note: once a profile expires, Xcode will generate a new one; you must then run +this script again to link the updated profiles. + +#### Custom provisioning + +1. Obtain a provisioning profile from Apple. + +Tip: You can use this command to see the provisioning profiles you have +previously downloaded using Xcode: `open ~/Library/MobileDevice/"Provisioning +Profiles"`. If there are none, generate and download a profile on +[Apple's developer site](https://developer.apple.com/account/resources/). + +1. Symlink or copy your provisioning profile to + `mediapipe/mediapipe/provisioning_profile.mobileprovision`. + + ```bash + cd mediapipe + ln -s ~/Downloads/MyProvisioningProfile.mobileprovision mediapipe/provisioning_profile.mobileprovision + ``` + +Note: if you had previously set up automatic provisioning, you should remove the +`provisioning_profile.mobileprovision` symlink in each example's directory, +since it will take precedence over the common one. You can also overwrite it +with your own profile if you need a different profile for different apps. + +1. Open `mediapipe/examples/ios/bundle_id.bzl`, and change the + `BUNDLE_ID_PREFIX` to a prefix associated with your provisioning profile. + +### Build and run an app using Xcode + +1. Create the Xcode project, and make sure you have set up either automatic or + custom provisioning. + +2. You can now select any of the MediaPipe demos in the target menu, and build + and run them as normal. + +Note: When you ask Xcode to run an app, by default it will use the Debug +configuration. Some of our demos are computationally heavy; you may want to use +the Release configuration for better performance. 
+ +Note: Due to an incompatibility caused by one of our dependencies, MediaPipe +cannot be used for apps running on the iPhone Simulator on Apple Silicon (M1). + +Tip: To switch build configuration in Xcode, click on the target menu, choose +"Edit Scheme...", select the Run action, and switch the Build Configuration from +Debug to Release. Note that this is set independently for each target. + +Tip: On the device, in Settings > General > Device Management, make sure the +developer (yourself) is trusted. + +### Build an app using the command line + +1. Make sure you have set up either automatic or custom provisioning. + +2. Using [MediaPipe Hands](../solutions/hands.md) for example, run: + + ```bash + bazel build -c opt --config=ios_arm64 mediapipe/examples/ios/handtrackinggpu:HandTrackingGpuApp + ``` + + You may see a permission request from `codesign` in order to sign the app. + + Tip: If you are using custom provisioning, you can run this + [script](https://github.com/google/mediapipe/blob/master/build_ios_examples.sh) + to build all MediaPipe iOS example apps. + +3. In Xcode, open the `Devices and Simulators` window (command-shift-2). + +4. Make sure your device is connected. You will see a list of installed apps. + Press the "+" button under the list, and select the `.ipa` file built by + Bazel. + +5. You can now run the app on your device. + +Tip: On the device, in Settings > General > Device Management, make sure the +developer (yourself) is trusted. diff --git a/docs/getting_started/javascript.md b/docs/getting_started/javascript.md new file mode 100644 index 0000000..e68d409 --- /dev/null +++ b/docs/getting_started/javascript.md @@ -0,0 +1,109 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/ +title: MediaPipe in JavaScript +parent: Getting Started +nav_order: 4 +--- + +# MediaPipe in JavaScript +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! 
We are moving to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe starting April 3, 2023.* + +---- + +## Ready-to-use JavaScript Solutions + +MediaPipe currently offers the following solutions: + +Solution | NPM Package | Example +--------------------------- | --------------------------------------- | ------- +[Face Mesh][F-pg] | [@mediapipe/face_mesh][F-npm] | [mediapipe.dev/demo/face_mesh][F-demo] +[Face Detection][Fd-pg] | [@mediapipe/face_detection][Fd-npm] | [mediapipe.dev/demo/face_detection][Fd-demo] +[Hands][H-pg] | [@mediapipe/hands][H-npm] | [mediapipe.dev/demo/hands][H-demo] +[Holistic][Ho-pg] | [@mediapipe/holistic][Ho-npm] | [mediapipe.dev/demo/holistic][Ho-demo] +[Objectron][Ob-pg] | [@mediapipe/objectron][Ob-npm] | [mediapipe.dev/demo/objectron][Ob-demo] +[Pose][P-pg] | [@mediapipe/pose][P-npm] | [mediapipe.dev/demo/pose][P-demo] +[Selfie Segmentation][S-pg] | [@mediapipe/selfie_segmentation][S-npm] | [mediapipe.dev/demo/selfie_segmentation][S-demo] + +Click on a solution link above for more information, including API and code +snippets. + +### Supported platforms: + +| Browser | Platform | Notes | +| ------- | ----------------------- | -------------------------------------- | +| Chrome | Android / Windows / Mac | Pixel 4 and older unsupported. Fuchsia | +| | | unsupported. | +| Chrome | iOS | Camera unavailable in Chrome on iOS. | +| Safari | iPad/iPhone/Mac | iOS and Safari on iPad / iPhone / | +| | | MacBook | + +The quickest way to get acclimated is to look at the examples above. Each demo +has a link to a [CodePen][codepen] so that you can edit the code and try it +yourself. We have included a number of utility packages to help you get started: + +* [@mediapipe/drawing_utils][draw-npm] - Utilities to draw landmarks and + connectors. +* [@mediapipe/camera_utils][cam-npm] - Utilities to operate the camera. 
+* [@mediapipe/control_utils][ctrl-npm] - Utilities to show sliders and FPS + widgets. + +Note: See these demos and more at [MediaPipe on CodePen][codepen] + +All of these solutions are staged in [NPM][npm]. You can install any package +locally with `npm install`. Example: + +``` +npm install @mediapipe/holistic +``` + +If you would rather not stage these locally, you can rely on a CDN (e.g., +[jsDelivr](https://www.jsdelivr.com/)). This will allow you to add scripts +directly to your HTML: + +``` + + + + +``` + +Note: You can specify version numbers to both NPM and jsDelivr. They are +structured as `<major>.<minor>.<build>`. To prevent breaking changes from +affecting your work, restrict your request to a `<minor>` number, e.g., +`@mediapipe/holistic@0.1`. + +[Ho-pg]: ../solutions/holistic#javascript-solution-api +[F-pg]: ../solutions/face_mesh#javascript-solution-api +[Fd-pg]: ../solutions/face_detection#javascript-solution-api +[H-pg]: ../solutions/hands#javascript-solution-api +[Ob-pg]: ../solutions/objectron#javascript-solution-api +[P-pg]: ../solutions/pose#javascript-solution-api +[S-pg]: ../solutions/selfie_segmentation#javascript-solution-api +[Ho-npm]: https://www.npmjs.com/package/@mediapipe/holistic +[F-npm]: https://www.npmjs.com/package/@mediapipe/face_mesh +[Fd-npm]: https://www.npmjs.com/package/@mediapipe/face_detection +[H-npm]: https://www.npmjs.com/package/@mediapipe/hands +[Ob-npm]: https://www.npmjs.com/package/@mediapipe/objectron +[P-npm]: https://www.npmjs.com/package/@mediapipe/pose +[S-npm]: https://www.npmjs.com/package/@mediapipe/selfie_segmentation +[draw-npm]: https://www.npmjs.com/package/@mediapipe/drawing_utils +[cam-npm]: https://www.npmjs.com/package/@mediapipe/camera_utils +[ctrl-npm]: https://www.npmjs.com/package/@mediapipe/control_utils +[Ho-demo]: https://mediapipe.dev/demo/holistic +[F-demo]: https://mediapipe.dev/demo/face_mesh +[Fd-demo]: https://mediapipe.dev/demo/face_detection +[H-demo]: https://mediapipe.dev/demo/hands +[Ob-demo]:
https://mediapipe.dev/demo/objectron +[P-demo]: https://mediapipe.dev/demo/pose +[S-demo]: https://mediapipe.dev/demo/selfie_segmentation +[npm]: https://www.npmjs.com/package/@mediapipe +[codepen]: https://code.mediapipe.dev/codepen diff --git a/docs/getting_started/python.md b/docs/getting_started/python.md new file mode 100644 index 0000000..43f452a --- /dev/null +++ b/docs/getting_started/python.md @@ -0,0 +1,151 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/ +title: MediaPipe in Python +parent: Getting Started +has_children: true +has_toc: false +nav_order: 3 +--- + +# MediaPipe in Python +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Ready-to-use Python Solutions + +MediaPipe offers ready-to-use yet customizable Python solutions as a prebuilt +Python package. MediaPipe Python package is available on +[PyPI](https://pypi.org/project/mediapipe/) for Linux, macOS and Windows. + +You can, for instance, activate a Python virtual environment: + +```bash +$ python3 -m venv mp_env && source mp_env/bin/activate +``` + +Install MediaPipe Python package and start Python interpreter: + +```bash +(mp_env)$ pip install mediapipe +(mp_env)$ python3 +``` + +In Python interpreter, import the package and start using one of the solutions: + +```python +import mediapipe as mp +mp_face_mesh = mp.solutions.face_mesh +``` + +Tip: Use command `deactivate` to later exit the Python virtual environment. 
+ +To learn more about configuration options and usage examples, please find +details in each solution via the links below: + +* [MediaPipe Face Detection](../solutions/face_detection#python-solution-api) +* [MediaPipe Face Mesh](../solutions/face_mesh#python-solution-api) +* [MediaPipe Hands](../solutions/hands#python-solution-api) +* [MediaPipe Holistic](../solutions/holistic#python-solution-api) +* [MediaPipe Objectron](../solutions/objectron#python-solution-api) +* [MediaPipe Pose](../solutions/pose#python-solution-api) +* [MediaPipe Selfie Segmentation](../solutions/selfie_segmentation#python-solution-api) + +## MediaPipe on Google Colab + +* [MediaPipe Face Detection Colab](https://mediapipe.page.link/face_detection_py_colab) +* [MediaPipe Face Mesh Colab](https://mediapipe.page.link/face_mesh_py_colab) +* [MediaPipe Hands Colab](https://mediapipe.page.link/hands_py_colab) +* [MediaPipe Holistic Colab](https://mediapipe.page.link/holistic_py_colab) +* [MediaPipe Objectron Colab](https://mediapipe.page.link/objectron_py_colab) +* [MediaPipe Pose Colab](https://mediapipe.page.link/pose_py_colab) +* [MediaPipe Pose Classification Colab (Basic)](https://mediapipe.page.link/pose_classification_basic) +* [MediaPipe Pose Classification Colab (Extended)](https://mediapipe.page.link/pose_classification_extended) +* [MediaPipe Selfie Segmentation Colab](https://mediapipe.page.link/selfie_segmentation_py_colab) + +## MediaPipe Python Framework + +The ready-to-use solutions are built upon the MediaPipe Python framework, which +can be used by advanced users to run their own MediaPipe graphs in Python. +Please see [here](./python_framework.md) for more info. + +## Building MediaPipe Python Package + +Follow the steps below only if you have local changes and need to build the +Python package from source. Otherwise, we strongly encourage our users to simply +run `pip install mediapipe` to use the ready-to-use solutions, more convenient +and much faster. 
+ +MediaPipe PyPI currently doesn't provide aarch64 Python wheel +files. For building and using MediaPipe Python on aarch64 Linux systems such as +Nvidia Jetson and Raspberry Pi, please read +[here](https://github.com/jiuqiant/mediapipe-python-aarch64). + +1. Make sure that Bazel and OpenCV are correctly installed and configured for + MediaPipe. Please see [Installation](./install.md) for how to setup Bazel + and OpenCV for MediaPipe on Linux and macOS. + +2. Install the following dependencies. + + Debian or Ubuntu: + + ```bash + $ sudo apt install python3-dev + $ sudo apt install python3-venv + $ sudo apt install -y protobuf-compiler + + # If you need to build opencv from source. + $ sudo apt install cmake + ``` + + macOS: + + ```bash + $ brew install protobuf + + # If you need to build opencv from source. + $ brew install cmake + ``` + + Windows: + + Download the latest protoc win64 zip from + [the Protobuf GitHub repo](https://github.com/protocolbuffers/protobuf/releases), + unzip the file, and copy the protoc.exe executable to a preferred location. + Please ensure that location is added into the Path environment variable. + +3. Activate a Python virtual environment. + + ```bash + $ python3 -m venv mp_env && source mp_env/bin/activate + ``` + +4. In the virtual environment, go to the MediaPipe repo directory. + +5. Install the required Python packages. + + ```bash + (mp_env)mediapipe$ pip3 install -r requirements.txt + ``` + +6. Build and install MediaPipe package. + + ```bash + (mp_env)mediapipe$ python3 setup.py install --link-opencv + ``` + + or + + ```bash + (mp_env)mediapipe$ python3 setup.py bdist_wheel + ``` +7. Exit from the MediaPipe repo directory and launch the Python interpreter. 
diff --git a/docs/getting_started/python_framework.md b/docs/getting_started/python_framework.md new file mode 100644 index 0000000..60f3878 --- /dev/null +++ b/docs/getting_started/python_framework.md @@ -0,0 +1,274 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/python_framework +parent: MediaPipe in Python +grand_parent: Getting Started +nav_order: 1 +--- + +# MediaPipe Python Framework +{: .no_toc } + +1. TOC +{:toc} +--- +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +The MediaPipe Python framework grants direct access to the core components of +the MediaPipe C++ framework such as Timestamp, Packet, and CalculatorGraph, +whereas the +[ready-to-use Python solutions](./python.md#ready-to-use-python-solutions) hide +the technical details of the framework and simply return the readable model +inference results back to the callers. + +MediaPipe framework sits on top of +[the pybind11 library](https://pybind11.readthedocs.io/en/stable/index.html). +The C++ core framework is exposed in Python via a C++/Python language binding. +The content below assumes that the reader already has a basic understanding of +the MediaPipe C++ framework. Otherwise, you can find useful information in +[Framework Concepts](../framework_concepts/framework_concepts.md). + +### Packet + +The packet is the basic data flow unit in MediaPipe. A packet consists of a +numeric timestamp and a shared pointer to an immutable payload. In Python, a +MediaPipe packet can be created by calling one of the packet creator methods in +the +[`mp.packet_creator`](https://github.com/google/mediapipe/tree/master/mediapipe/python/pybind/packet_creator.cc) +module. 
Correspondingly, the packet payload can be retrieved by using one of the +packet getter methods in the +[`mp.packet_getter`](https://github.com/google/mediapipe/tree/master/mediapipe/python/pybind/packet_getter.cc) +module. Note that the packet payload becomes **immutable** after packet +creation. Thus, the modification of the retrieved packet content doesn't affect +the actual payload in the packet. MediaPipe framework Python API supports the +most commonly used data types of MediaPipe (e.g., ImageFrame, Matrix, Protocol +Buffers, and the primitive data types) in the core binding. The comprehensive +table below shows the type mappings between the Python and the C++ data type +along with the packet creator and the content getter method for each data type +supported by the MediaPipe Python framework API. + +Python Data Type | C++ Data Type | Packet Creator | Content Getter +------------------------------------ | ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------- +bool | bool | create_bool(True) | get_bool(packet) +int or np.intc | int_t | create_int(1) | get_int(packet) +int or np.int8 | int8_t | create_int8(2**7-1) | get_int(packet) +int or np.int16 | int16_t | create_int16(2**15-1) | get_int(packet) +int or np.int32 | int32_t | create_int32(2**31-1) | get_int(packet) +int or np.int64 | int64_t | create_int64(2**63-1) | get_int(packet) +int or np.uint8 | uint8_t | create_uint8(2**8-1) | get_uint(packet) +int or np.uint16 | uint16_t | create_uint16(2**16-1) | get_uint(packet) +int or np.uint32 | uint32_t | create_uint32(2**32-1) | get_uint(packet) +int or np.uint64 | uint64_t | create_uint64(2**64-1) | get_uint(packet) +float or np.float32 | float | create_float(1.1) | get_float(packet) +float or np.double | double | create_double(1.1) | get_float(packet) +str (UTF-8) | std::string | create_string('abc') | 
get_str(packet) +bytes | std::string | create_string(b'\xd0\xd0\xd0') | get_bytes(packet) +mp.Packet | mp::Packet | create_packet(p) | get_packet(packet) +List\[bool\] | std::vector\<bool\> | create_bool_vector(\[True, False\]) | get_bool_list(packet) +List\[int\] or List\[np.intc\] | int\[\] | create_int_array(\[1, 2, 3\]) | get_int_list(packet, size=10) +List\[int\] or List\[np.intc\] | std::vector\<int\> | create_int_vector(\[1, 2, 3\]) | get_int_list(packet) +List\[float\] or List\[np.float\] | float\[\] | create_float_array(\[0.1, 0.2\]) | get_float_list(packet, size=10) +List\[float\] or List\[np.float\] | std::vector\<float\> | create_float_vector(\[0.1, 0.2\]) | get_float_list(packet, size=10) +List\[str\] | std::vector\<std::string\> | create_string_vector(\['a'\]) | get_str_list(packet) +List\[mp.Packet\] | std::vector\<mp::Packet\> | create_packet_vector(
        \[packet1, packet2\]) | get_packet_list(p) +Mapping\[str, Packet\] | std::map | create_string_to_packet_map(
        {'a': packet1, 'b': packet2}) | get_str_to_packet_dict(packet) +np.ndarray
(cv.mat and PIL.Image) | mp::ImageFrame | create_image_frame(
        format=ImageFormat.SRGB,
        data=mat) | get_image_frame(packet) +np.ndarray | mp::Matrix | create_matrix(data) | get_matrix(packet) +Google Proto Message | Google Proto Message | create_proto(proto) | get_proto(packet) +List\[Proto\] | std::vector\ | n/a | get_proto_list(packet) + +It's not uncommon that users create custom C++ classes and send those into +the graphs and calculators. To allow the custom classes to be used in Python +with MediaPipe, you may extend the Packet API for a new data type in the +following steps: + +1. Write the pybind11 + [class binding code](https://pybind11.readthedocs.io/en/stable/advanced/classes.html) + or + [a custom type caster](https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html?highlight=custom%20type%20caster) + for the custom type in a cc file. + + ```c++ + #include "path/to/my_type/header/file.h" + #include "pybind11/pybind11.h" + + namespace py = pybind11; + + PYBIND11_MODULE(my_type_binding, m) { + // Write binding code or a custom type caster for MyType. + py::class_(m, "MyType") + .def(py::init<>()) + .def(...); + } + ``` + +2. Create a new packet creator and getter method of the custom type in a + separate cc file. + + ```c++ + #include "path/to/my_type/header/file.h" + #include "mediapipe/framework/packet.h" + #include "pybind11/pybind11.h" + + namespace mediapipe { + namespace py = pybind11; + + PYBIND11_MODULE(my_packet_methods, m) { + m.def( + "create_my_type", + [](const MyType& my_type) { return MakePacket(my_type); }); + + m.def( + "get_my_type", + [](const Packet& packet) { + if(!packet.ValidateAsType().ok()) { + PyErr_SetString(PyExc_ValueError, "Packet data type mismatch."); + return py::error_already_set(); + } + return packet.Get(); + }); + } + } // namespace mediapipe + ``` + +3. Add two bazel build rules for the custom type binding and the new packet + methods in the BUILD file. 
+ + ``` + load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") + + pybind_extension( + name = "my_type_binding", + srcs = ["my_type_binding.cc"], + deps = [":my_type"], + ) + + pybind_extension( + name = "my_packet_methods", + srcs = ["my_packet_methods.cc"], + deps = [ + ":my_type", + "//mediapipe/framework:packet" + ], + ) + ``` + +4. Build the pybind extension targets (with the suffix .so) by Bazel and move the generated dynamic libraries into one of the $LD_LIBRARY_PATH dirs. + +5. Use the binding modules in Python. + + ```python + import my_type_binding + import my_packet_methods + + packet = my_packet_methods.create_my_type(my_type_binding.MyType()) + my_type = my_packet_methods.get_my_type(packet) + ``` + +### Timestamp + +Each packet contains a timestamp that is in units of microseconds. In Python, +the Packet API provides a convenience method `packet.at()` to define the numeric +timestamp of a packet. More generally, `packet.timestamp` is the packet class +property for accessing the underlying timestamp. To convert an Unix epoch to a +MediaPipe timestamp, +[the Timestamp API](https://github.com/google/mediapipe/tree/master/mediapipe/python/pybind/timestamp.cc) +offers a method `mp.Timestamp.from_seconds()` for this purpose. + +### ImageFrame + +ImageFrame is the container for storing an image or a video frame. Formats +supported by ImageFrame are listed in +[the ImageFormat enum](https://github.com/google/mediapipe/tree/master/mediapipe/python/pybind/image_frame.cc#l=170). +Pixels are encoded row-major with interleaved color components, and ImageFrame +supports uint8, uint16, and float as its data types. MediaPipe provides +[an ImageFrame Python API](https://github.com/google/mediapipe/tree/master/mediapipe/python/pybind/image_frame.cc) +to access the ImageFrame C++ class. In Python, the easiest way to retrieve the +pixel data is to call `image_frame.numpy_view()` to get a numpy ndarray. 
Note +that the returned numpy ndarray, a reference to the internal pixel data, is +unwritable. If the callers need to modify the numpy ndarray, it's required to +explicitly call a copy operation to obtain a copy. When MediaPipe takes a numpy +ndarray to make an ImageFrame, it assumes that the data is stored contiguously. +Correspondingly, the pixel data of an ImageFrame will be realigned to be +contiguous when it's returned to the Python side. + +### Graph + +In MediaPipe, all processing takes place within the context of a +CalculatorGraph. +[The CalculatorGraph Python API](https://github.com/google/mediapipe/tree/master/mediapipe/python/pybind/calculator_graph.cc) +is a direct binding to the C++ CalculatorGraph class. The major difference is +that the CalculatorGraph Python API raises a Python error instead of returning a +non-OK Status when an error occurs. Therefore, as a Python user, you can handle +the exceptions as you normally do. The life cycle of a CalculatorGraph contains +three stages: initialization and setup, graph run, and graph shutdown. + +1. Initialize a CalculatorGraph with a CalculatorGraphConfig protobuf or binary + protobuf file, and provide callback method(s) to observe the output + stream(s). + + Option 1. Initialize a CalculatorGraph with a CalculatorGraphConfig protobuf + or its text representation, and observe the output stream(s): + + ```python + import mediapipe as mp + + config_text = """ + input_stream: 'in_stream' + output_stream: 'out_stream' + node { + calculator: 'PassThroughCalculator' + input_stream: 'in_stream' + output_stream: 'out_stream' + } + """ + graph = mp.CalculatorGraph(graph_config=config_text) + output_packets = [] + graph.observe_output_stream( + 'out_stream', + lambda stream_name, packet: + output_packets.append(mp.packet_getter.get_str(packet))) + ``` + + Option 2. Initialize a CalculatorGraph with a binary protobuf file, and + observe the output stream(s).
+ + ```python + import mediapipe as mp + # resources dependency + + graph = mp.CalculatorGraph( + binary_graph=os.path.join( + resources.GetRunfilesDir(), 'path/to/your/graph.binarypb')) + graph.observe_output_stream( + 'out_stream', + lambda stream_name, packet: print(f'Get {packet} from {stream_name}')) + ``` + +2. Start the graph run and feed packets into the graph. + + ```python + graph.start_run() + + graph.add_packet_to_input_stream( + 'in_stream', mp.packet_creator.create_string('abc').at(0)) + + rgb_img = cv2.cvtColor(cv2.imread('/path/to/your/image.png'), cv2.COLOR_BGR2RGB) + graph.add_packet_to_input_stream( + 'in_stream', + mp.packet_creator.create_image_frame(image_format=mp.ImageFormat.SRGB, + data=rgb_img).at(1)) + ``` + +3. Close the graph after finish. You may restart the graph for another graph + run after the call to `close()`. + + ```python + graph.close() + ``` + +The Python script can be run by your local Python runtime. diff --git a/docs/getting_started/troubleshooting.md b/docs/getting_started/troubleshooting.md new file mode 100644 index 0000000..e7dff33 --- /dev/null +++ b/docs/getting_started/troubleshooting.md @@ -0,0 +1,284 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/framework/getting_started/troubleshooting +title: Troubleshooting +parent: Getting Started +nav_order: 10 +--- + +# Troubleshooting +{: .no_toc } + +1. TOC +{:toc} +--- + +**Attention:** *Thanks for your interest in MediaPipe! We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +---- + +## Missing Python binary path + +The error message: + +``` +ERROR: An error occurred during the fetch of repository 'local_execution_config_python': + Traceback (most recent call last): + File "/sandbox_path/external/org_tensorflow/third_party/py/python_configure.bzl", line 208 + get_python_bin(repository_ctx) + ... 
+Repository command failed +``` + +usually indicates that Bazel fails to find the local Python binary. To solve +this issue, please first find where the python binary is and then add +`--action_env PYTHON_BIN_PATH=` to the Bazel command. For +example, you can switch to use the system default python3 binary by the +following command: + +``` +bazel build -c opt \ + --define MEDIAPIPE_DISABLE_GPU=1 \ + --action_env PYTHON_BIN_PATH=$(which python3) \ + mediapipe/examples/desktop/hello_world +``` + +## Missing necessary Python packages + +The error message: + +``` +ImportError: No module named numpy +Is numpy installed? +``` + +usually indicates that certain Python packages are not installed. Please run +`pip install` or `pip3 install` depending on your Python binary version to +install those packages. + +## Fail to fetch remote dependency repositories + +The error message: + +``` +ERROR: An error occurred during the fetch of repository 'org_tensorflow': + java.io.IOException: Error downloading [https://mirror.bazel.build/github.com/tensorflow/tensorflow/archive/77e9ffb9b2bfb1a4f7056e62d84039626923e328.tar.gz, https://github.com/tensorflow/tensorflow/archive/77e9ffb9b2bfb1a4f7056e62d84039626923e328.tar.gz] to /sandbox_path/external/org_tensorflow/77e9ffb9b2bfb1a4f7056e62d84039626923e328.tar.gz: Tried to reconnect at offset 9,944,151 but server didn't support it + +or + +WARNING: Download from https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_swift/releases/download/0.12.1/rules_swift.0.12.1.tar.gz failed: class java.net.ConnectException Connection timed out (Connection timed out) +``` + +usually indicates that Bazel fails to download necessary dependency repositories +that MediaPipe needs. MediaPipe has several dependency repositories that are +hosted by Google sites. In some regions, you may need to set up a network proxy +or use a VPN to access those resources. 
You may also need to append +`--host_jvm_args "-DsocksProxyHost=<ip address> -DsocksProxyPort=<port number>"` +to the Bazel command. See +[this GitHub issue](https://github.com/google/mediapipe/issues/581#issuecomment-610356857) +for more details. + +If you believe that it's not a network issue, another possibility is that some +resources could be temporarily unavailable; in that case, please run `bazel clean --expunge` +and retry it later. If it's still not working, please file a GitHub issue with +the detailed error message. + +## Incorrect MediaPipe OpenCV config + +The error message: + +``` +error: undefined reference to 'cv::String::deallocate()' +error: undefined reference to 'cv::String::allocate(unsigned long)' +error: undefined reference to 'cv::VideoCapture::VideoCapture(cv::String const&)' +... +error: undefined reference to 'cv::putText(cv::InputOutputArray const&, cv::String const&, cv::Point, int, double, cv::Scalar, int, int, bool)' +``` + +usually indicates that OpenCV is not properly configured for MediaPipe. Please +take a look at the "Install OpenCV and FFmpeg" sections in +[Installation](./install.md) to see how to modify MediaPipe's WORKSPACE and +linux_opencv/macos_opencv/windows_opencv.BUILD files for your local opencv +libraries. [This GitHub issue](https://github.com/google/mediapipe/issues/666) +may also help. + +## Python pip install failure + +The error message: + +``` +ERROR: Could not find a version that satisfies the requirement mediapipe +ERROR: No matching distribution found for mediapipe +``` + +after running `pip install mediapipe` usually indicates that there is no qualified MediaPipe Python for your system. +Please note that MediaPipe Python PyPI officially supports the **64-bit** +version of Python 3.7 to 3.10 on the following OS: + +- x86_64 Linux +- x86_64 macOS 10.15+ +- amd64 Windows + +If the OS is currently supported and you still see this error, please make sure +that both the Python and pip binary are for Python 3.7 to 3.10.
Otherwise, +please consider building the MediaPipe Python package locally by following the +instructions [here](python.md#building-mediapipe-python-package). + +## Python DLL load failure on Windows + +The error message: + +``` +ImportError: DLL load failed: The specified module could not be found +``` + +usually indicates that the local Windows system is missing Visual C++ +redistributable packages and/or Visual C++ runtime DLLs. This can be solved by +either installing the official +[vc_redist.x64.exe](https://support.microsoft.com/en-us/topic/the-latest-supported-visual-c-downloads-2647da03-1eea-4433-9aff-95f26a218cc0) +or installing the "msvc-runtime" Python package by running + +```bash +$ python -m pip install msvc-runtime +``` + +Please note that the "msvc-runtime" Python package is not released or maintained +by Microsoft. + +## Native method not found + +The error message: + +``` +java.lang.UnsatisfiedLinkError: No implementation found for void com.google.wick.Wick.nativeWick +``` + +usually indicates that a needed native library, such as `/libwickjni.so` has not +been loaded or has not been included in the dependencies of the app or cannot be +found for some reason. Note that Java requires every native library to be +explicitly loaded using the function `System.loadLibrary`. + +## No registered calculator found + +The error message: + +``` +No registered object with name: OurNewCalculator; Unable to find Calculator "OurNewCalculator" +``` + +usually indicates that `OurNewCalculator` is referenced by name in a +[`CalculatorGraphConfig`] but that the library target for OurNewCalculator has +not been linked to the application binary. When a new calculator is added to a +calculator graph, that calculator must also be added as a build dependency of +the applications using the calculator graph. + +This error is caught at runtime because calculator graphs reference their +calculators by name through the field `CalculatorGraphConfig::Node:calculator`. 
+When the library for a calculator is linked into an application binary, the +calculator is automatically registered by name through the +[`REGISTER_CALCULATOR`] macro using the [`registration.h`] library. Note that +[`REGISTER_CALCULATOR`] can register a calculator with a namespace prefix, +identical to its C++ namespace. In this case, the calculator graph must also use +the same namespace prefix. + +## Out Of Memory error + +Exhausting memory can be a symptom of too many packets accumulating inside a +running MediaPipe graph. This can occur for a number of reasons, such as: + +1. Some calculators in the graph simply can't keep pace with the arrival of + packets from a realtime input stream such as a video camera. +2. Some calculators are waiting for packets that will never arrive. + +For problem (1), it may be necessary to drop some old packets in order to +process the more recent packets. For some hints, see: +[`How to process realtime input streams`]. + +For problem (2), it could be that one input stream is lacking packets for some +reason. A device or a calculator may be misconfigured or may produce packets +only sporadically. This can cause downstream calculators to wait for many +packets that will never arrive, which in turn causes packets to accumulate on +some of their input streams. MediaPipe addresses this sort of problem using +"timestamp bounds". For some hints see: +[`How to process realtime input streams`]. + +The MediaPipe setting [`CalculatorGraphConfig::max_queue_size`] limits the +number of packets enqueued on any input stream by throttling inputs to the +graph. For realtime input streams, the number of packets queued at an input +stream should almost always be zero or one.
If this is not the case, you may see +the following warning message: + +``` +Resolved a deadlock by increasing max_queue_size of input stream +``` + +Also, the setting [`CalculatorGraphConfig::report_deadlock`] can be set to cause +graph run to fail and surface the deadlock as an error, such that max_queue_size +acts as a memory usage limit. + +## Graph hangs + +Many applications will call [`CalculatorGraph::CloseAllPacketSources`] and +[`CalculatorGraph::WaitUntilDone`] to finish or suspend execution of a MediaPipe +graph. The objective here is to allow any pending calculators or packets to +complete processing, and then to shut down the graph. If all goes well, every +stream in the graph will reach [`Timestamp::Done`], and every calculator will +reach [`CalculatorBase::Close`], and then [`CalculatorGraph::WaitUntilDone`] +will complete successfully. + +If some calculators or streams cannot reach state [`Timestamp::Done`] or +[`CalculatorBase::Close`], then the method [`CalculatorGraph::Cancel`] can be +called to terminate the graph run without waiting for all pending calculators +and packets to complete. + +## Output timing is uneven + +Some realtime MediaPipe graphs produce a series of video frames for viewing as a +video effect or as a video diagnostic. Sometimes, a MediaPipe graph will produce +these frames in clusters, for example when several output frames are +extrapolated from the same cluster of input frames. If the outputs are presented +as they are produced, some output frames are immediately replaced by later +frames in the same cluster, which makes the results hard to see and evaluate +visually. In cases like this, the output visualization can be improved by +presenting the frames at even intervals in real time. + +MediaPipe addresses this use case by mapping timestamps to points in real time.
+Each timestamp indicates a time in microseconds, and a calculator such as +`LiveClockSyncCalculator` can delay the output of packets to match their +timestamps. This sort of calculator adjusts the timing of outputs such that: + +1. The time between outputs corresponds to the time between timestamps as + closely as possible. +2. Outputs are produced with the smallest delay possible. + +## CalculatorGraph lags behind inputs + +For many realtime MediaPipe graphs, low latency is an objective. MediaPipe +supports "pipelined" style parallel processing in order to begin processing of +each packet as early as possible. Normally the lowest possible latency is the +total time required by each calculator along a "critical path" of successive +calculators. The latency of a MediaPipe graph could be worse than the ideal +due to delays introduced to display frames at even intervals as described in +[Output timing is uneven](#output-timing-is-uneven). + +If some of the calculators in the graph cannot keep pace with the realtime input +streams, then latency will continue to increase, and it becomes necessary to +drop some input packets. The recommended technique is to use the MediaPipe +calculators designed specifically for this purpose such as +[`FlowLimiterCalculator`] as described in +[`How to process realtime input streams`].
+ +[`CalculatorGraphConfig`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator.proto +[`CalculatorGraphConfig::max_queue_size`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator.proto +[`CalculatorGraphConfig::report_deadlock`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator.proto +[`REGISTER_CALCULATOR`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_registry.h +[`registration.h`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/deps/registration.h +[`CalculatorGraph::CloseAllPacketSources`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_graph.h +[`CalculatorGraph::Cancel`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_graph.h +[`CalculatorGraph::WaitUntilDone`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_graph.h +[`Timestamp::Done`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/timestamp.h +[`CalculatorBase::Close`]: https://github.com/google/mediapipe/tree/master/mediapipe/framework/calculator_base.h +[`FlowLimiterCalculator`]: https://github.com/google/mediapipe/tree/master/mediapipe/calculators/core/flow_limiter_calculator.cc +[`How to process realtime input streams`]: faq.md#how-to-process-realtime-input-streams diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..e4f5dd1 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,158 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe +title: Home +nav_order: 1 +--- + +---- + +**Attention:** *We have moved to +[https://developers.google.com/mediapipe](https://developers.google.com/mediapipe) +as the primary developer documentation site for MediaPipe as of April 3, 2023.* + +![MediaPipe](https://developers.google.com/static/mediapipe/images/home/hero_01_1920.png) + +**Attention**: MediaPipe Solutions Preview is 
an early release. [Learn +more](https://developers.google.com/mediapipe/solutions/about#notice). + +**On-device machine learning for everyone** + +Delight your customers with innovative machine learning features. MediaPipe +contains everything that you need to customize and deploy to mobile (Android, +iOS), web, desktop, edge devices, and IoT, effortlessly. + +* [See demos](https://goo.gle/mediapipe-studio) +* [Learn more](https://developers.google.com/mediapipe/solutions) + +## Get started + +You can get started with MediaPipe Solutions by checking out any of the +developer guides for +[vision](https://developers.google.com/mediapipe/solutions/vision/object_detector), +[text](https://developers.google.com/mediapipe/solutions/text/text_classifier), +and +[audio](https://developers.google.com/mediapipe/solutions/audio/audio_classifier) +tasks. If you need help setting up a development environment for use with +MediaPipe Tasks, check out the setup guides for +[Android](https://developers.google.com/mediapipe/solutions/setup_android), [web +apps](https://developers.google.com/mediapipe/solutions/setup_web), and +[Python](https://developers.google.com/mediapipe/solutions/setup_python). + +## Solutions + +MediaPipe Solutions provides a suite of libraries and tools for you to quickly +apply artificial intelligence (AI) and machine learning (ML) techniques in your +applications. You can plug these solutions into your applications immediately, +customize them to your needs, and use them across multiple development +platforms. MediaPipe Solutions is part of the MediaPipe [open source +project](https://github.com/google/mediapipe), so you can further customize the +solutions code to meet your application needs. + +These libraries and resources provide the core functionality for each MediaPipe +Solution: + +* **MediaPipe Tasks**: Cross-platform APIs and libraries for deploying + solutions. [Learn + more](https://developers.google.com/mediapipe/solutions/tasks).
+* **MediaPipe models**: Pre-trained, ready-to-run models for use with each + solution. + +These tools let you customize and evaluate solutions: + +* **MediaPipe Model Maker**: Customize models for solutions with your data. + [Learn more](https://developers.google.com/mediapipe/solutions/model_maker). +* **MediaPipe Studio**: Visualize, evaluate, and benchmark solutions in your + browser. [Learn + more](https://developers.google.com/mediapipe/solutions/studio). + +### Legacy solutions + +We have ended support for [these MediaPipe Legacy Solutions](https://developers.google.com/mediapipe/solutions/guide#legacy) +as of March 1, 2023. All other MediaPipe Legacy Solutions will be upgraded to +a new MediaPipe Solution. See the [Solutions guide](https://developers.google.com/mediapipe/solutions/guide#legacy) +for details. The [code repository](https://github.com/google/mediapipe/tree/master/mediapipe) +and prebuilt binaries for all MediaPipe Legacy Solutions will continue to be +provided on an as-is basis. + +For more on the legacy solutions, see the [documentation](https://github.com/google/mediapipe/tree/master/docs/solutions). + +## Framework + +To start using MediaPipe Framework, [install MediaPipe +Framework](https://developers.google.com/mediapipe/framework/getting_started/install) +and start building example applications in C++, Android, and iOS. + +[MediaPipe Framework](https://developers.google.com/mediapipe/framework) is the +low-level component used to build efficient on-device machine learning +pipelines, similar to the premade MediaPipe Solutions. 
+ +Before using MediaPipe Framework, familiarize yourself with the following key +[Framework +concepts](https://developers.google.com/mediapipe/framework/framework_concepts/overview.md): + +* [Packets](https://developers.google.com/mediapipe/framework/framework_concepts/packets.md) +* [Graphs](https://developers.google.com/mediapipe/framework/framework_concepts/graphs.md) +* [Calculators](https://developers.google.com/mediapipe/framework/framework_concepts/calculators.md) + +## Community + +* [Slack community](https://mediapipe.page.link/joinslack) for MediaPipe + users. +* [Discuss](https://groups.google.com/forum/#!forum/mediapipe) - General + community discussion around MediaPipe. +* [Awesome MediaPipe](https://mediapipe.page.link/awesome-mediapipe) - A + curated list of awesome MediaPipe related frameworks, libraries and + software. + +## Contributing + +We welcome contributions. Please follow these +[guidelines](https://github.com/google/mediapipe/blob/master/CONTRIBUTING.md). + +We use GitHub issues for tracking requests and bugs. Please post questions to +the MediaPipe Stack Overflow with a `mediapipe` tag. 
+ +## Resources + +### Publications + +* [Bringing artworks to life with AR](https://developers.googleblog.com/2021/07/bringing-artworks-to-life-with-ar.html) + in Google Developers Blog +* [Prosthesis control via Mirru App using MediaPipe hand tracking](https://developers.googleblog.com/2021/05/control-your-mirru-prosthesis-with-mediapipe-hand-tracking.html) + in Google Developers Blog +* [SignAll SDK: Sign language interface using MediaPipe is now available for + developers](https://developers.googleblog.com/2021/04/signall-sdk-sign-language-interface-using-mediapipe-now-available.html) + in Google Developers Blog +* [MediaPipe Holistic - Simultaneous Face, Hand and Pose Prediction, on + Device](https://ai.googleblog.com/2020/12/mediapipe-holistic-simultaneous-face.html) + in Google AI Blog +* [Background Features in Google Meet, Powered by Web ML](https://ai.googleblog.com/2020/10/background-features-in-google-meet.html) + in Google AI Blog +* [MediaPipe 3D Face Transform](https://developers.googleblog.com/2020/09/mediapipe-3d-face-transform.html) + in Google Developers Blog +* [Instant Motion Tracking With MediaPipe](https://developers.googleblog.com/2020/08/instant-motion-tracking-with-mediapipe.html) + in Google Developers Blog +* [BlazePose - On-device Real-time Body Pose Tracking](https://ai.googleblog.com/2020/08/on-device-real-time-body-pose-tracking.html) + in Google AI Blog +* [MediaPipe Iris: Real-time Eye Tracking and Depth Estimation](https://ai.googleblog.com/2020/08/mediapipe-iris-real-time-iris-tracking.html) + in Google AI Blog +* [MediaPipe KNIFT: Template-based feature matching](https://developers.googleblog.com/2020/04/mediapipe-knift-template-based-feature-matching.html) + in Google Developers Blog +* [Alfred Camera: Smart camera features using MediaPipe](https://developers.googleblog.com/2020/03/alfred-camera-smart-camera-features-using-mediapipe.html) + in Google Developers Blog +* [Real-Time 3D Object Detection on Mobile Devices with 
MediaPipe](https://ai.googleblog.com/2020/03/real-time-3d-object-detection-on-mobile.html) + in Google AI Blog +* [AutoFlip: An Open Source Framework for Intelligent Video Reframing](https://ai.googleblog.com/2020/02/autoflip-open-source-framework-for.html) + in Google AI Blog +* [MediaPipe on the Web](https://developers.googleblog.com/2020/01/mediapipe-on-web.html) + in Google Developers Blog +* [Object Detection and Tracking using MediaPipe](https://developers.googleblog.com/2019/12/object-detection-and-tracking-using-mediapipe.html) + in Google Developers Blog +* [On-Device, Real-Time Hand Tracking with MediaPipe](https://ai.googleblog.com/2019/08/on-device-real-time-hand-tracking-with.html) + in Google AI Blog +* [MediaPipe: A Framework for Building Perception Pipelines](https://arxiv.org/abs/1906.08172) + +### Videos + +* [YouTube Channel](https://www.youtube.com/c/MediaPipe) diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..fc7a6f5 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,3 @@ +MediaPipe +===================================== +Please see https://developers.google.com/mediapipe/ diff --git a/docs/solutions/autoflip.md b/docs/solutions/autoflip.md new file mode 100644 index 0000000..a9e1e70 --- /dev/null +++ b/docs/solutions/autoflip.md @@ -0,0 +1,371 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/guide#legacy +title: AutoFlip (Saliency-aware Video Cropping) +parent: MediaPipe Legacy Solutions +nav_order: 14 +--- + +# AutoFlip: Saliency-aware Video Cropping +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +We have ended support for this MediaPipe Legacy Solution as of March 1, 2023. +For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/guide#legacy) +site.* + +---- + +## Overview + +AutoFlip is an automatic video cropping pipeline built on top of MediaPipe. This +example focuses on demonstrating how to use AutoFlip to convert an input video +to arbitrary aspect ratios. + +For overall context on AutoFlip, please read this +[Google AI Blog](https://ai.googleblog.com/2020/02/autoflip-open-source-framework-for.html). + +![graph is_required](https://mediapipe.dev/images/autoflip_edited_example.gif) + +## Building + +Run the following command to build the AutoFlip pipeline: + +Note: AutoFlip currently only works with OpenCV 3. Please verify your OpenCV +version beforehand. + +```bash +bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 mediapipe/examples/desktop/autoflip:run_autoflip +``` + +## Running + +```bash +GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/autoflip/run_autoflip \ + --calculator_graph_config_file=mediapipe/examples/desktop/autoflip/autoflip_graph.pbtxt \ + --input_side_packets=input_video_path=/absolute/path/to/the/local/video/file,output_video_path=/absolute/path/to/save/the/output/video/file,aspect_ratio=1:1 +``` + +Use the `aspect_ratio` flag to provide the output aspect ratio. The format +should be `width:height`, where the `width` and `height` are two positive +integers. AutoFlip supports both landscape-to-portrait and portrait-to-landscape +conversions. The pipeline internally compares the target aspect ratio against +the original one, and determines the correct conversion automatically. + +We have put a couple test videos under this +[Google Drive folder](https://drive.google.com/corp/drive/u/0/folders/1KK9LV--Ey0UEVpxssVLhVl7dypgJSQgk). 
+You could download the videos into your local file system, then modify the +command above accordingly to run AutoFlip against the videos. + +## MediaPipe Graph + +![graph visualization](https://mediapipe.dev/images/autoflip_graph.png) + +To visualize the graph as shown above, copy the text specification of the graph +below and paste it into [MediaPipe Visualizer](https://viz.mediapipe.dev). + +```bash +# Autoflip graph that only renders the final cropped video. For use with +# end user applications. +max_queue_size: -1 + +# VIDEO_PREP: Decodes an input video file into images and a video header. +node { + calculator: "OpenCvVideoDecoderCalculator" + input_side_packet: "INPUT_FILE_PATH:input_video_path" + output_stream: "VIDEO:video_raw" + output_stream: "VIDEO_PRESTREAM:video_header" + output_side_packet: "SAVED_AUDIO_PATH:audio_path" +} + +# VIDEO_PREP: Scale the input video before feature extraction. +node { + calculator: "ScaleImageCalculator" + input_stream: "FRAMES:video_raw" + input_stream: "VIDEO_HEADER:video_header" + output_stream: "FRAMES:video_frames_scaled" + node_options: { + [type.googleapis.com/mediapipe.ScaleImageCalculatorOptions]: { + preserve_aspect_ratio: true + output_format: SRGB + target_width: 480 + algorithm: DEFAULT_WITHOUT_UPSCALE + } + } +} + +# VIDEO_PREP: Create a low frame rate stream for feature extraction. +node { + calculator: "PacketThinnerCalculator" + input_stream: "video_frames_scaled" + output_stream: "video_frames_scaled_downsampled" + node_options: { + [type.googleapis.com/mediapipe.PacketThinnerCalculatorOptions]: { + thinner_type: ASYNC + period: 200000 + } + } +} + +# DETECTION: find borders around the video and major background color. +node { + calculator: "BorderDetectionCalculator" + input_stream: "VIDEO:video_raw" + output_stream: "DETECTED_BORDERS:borders" +} + +# DETECTION: find shot/scene boundaries on the full frame rate stream. 
+node { + calculator: "ShotBoundaryCalculator" + input_stream: "VIDEO:video_frames_scaled" + output_stream: "IS_SHOT_CHANGE:shot_change" + options { + [type.googleapis.com/mediapipe.autoflip.ShotBoundaryCalculatorOptions] { + min_shot_span: 0.2 + min_motion: 0.3 + window_size: 15 + min_shot_measure: 10 + min_motion_with_shot_measure: 0.05 + } + } +} + +# DETECTION: find faces on the down sampled stream +node { + calculator: "AutoFlipFaceDetectionSubgraph" + input_stream: "VIDEO:video_frames_scaled_downsampled" + output_stream: "DETECTIONS:face_detections" +} +node { + calculator: "FaceToRegionCalculator" + input_stream: "VIDEO:video_frames_scaled_downsampled" + input_stream: "FACES:face_detections" + output_stream: "REGIONS:face_regions" +} + +# DETECTION: find objects on the down sampled stream +node { + calculator: "AutoFlipObjectDetectionSubgraph" + input_stream: "VIDEO:video_frames_scaled_downsampled" + output_stream: "DETECTIONS:object_detections" +} +node { + calculator: "LocalizationToRegionCalculator" + input_stream: "DETECTIONS:object_detections" + output_stream: "REGIONS:object_regions" + options { + [type.googleapis.com/mediapipe.autoflip.LocalizationToRegionCalculatorOptions] { + output_all_signals: true + } + } +} + +# SIGNAL FUSION: Combine detections (with weights) on each frame +node { + calculator: "SignalFusingCalculator" + input_stream: "shot_change" + input_stream: "face_regions" + input_stream: "object_regions" + output_stream: "salient_regions" + options { + [type.googleapis.com/mediapipe.autoflip.SignalFusingCalculatorOptions] { + signal_settings { + type { standard: FACE_CORE_LANDMARKS } + min_score: 0.85 + max_score: 0.9 + is_required: false + } + signal_settings { + type { standard: FACE_ALL_LANDMARKS } + min_score: 0.8 + max_score: 0.85 + is_required: false + } + signal_settings { + type { standard: FACE_FULL } + min_score: 0.8 + max_score: 0.85 + is_required: false + } + signal_settings { + type: { standard: HUMAN } + min_score: 0.75 + 
max_score: 0.8 + is_required: false + } + signal_settings { + type: { standard: PET } + min_score: 0.7 + max_score: 0.75 + is_required: false + } + signal_settings { + type: { standard: CAR } + min_score: 0.7 + max_score: 0.75 + is_required: false + } + signal_settings { + type: { standard: OBJECT } + min_score: 0.1 + max_score: 0.2 + is_required: false + } + } + } +} + +# CROPPING: make decisions about how to crop each frame. +node { + calculator: "SceneCroppingCalculator" + input_side_packet: "EXTERNAL_ASPECT_RATIO:aspect_ratio" + input_stream: "VIDEO_FRAMES:video_raw" + input_stream: "KEY_FRAMES:video_frames_scaled_downsampled" + input_stream: "DETECTION_FEATURES:salient_regions" + input_stream: "STATIC_FEATURES:borders" + input_stream: "SHOT_BOUNDARIES:shot_change" + output_stream: "CROPPED_FRAMES:cropped_frames" + node_options: { + [type.googleapis.com/mediapipe.autoflip.SceneCroppingCalculatorOptions]: { + max_scene_size: 600 + key_frame_crop_options: { + score_aggregation_type: CONSTANT + } + scene_camera_motion_analyzer_options: { + motion_stabilization_threshold_percent: 0.5 + salient_point_bound: 0.499 + } + padding_parameters: { + blur_cv_size: 200 + overlay_opacity: 0.6 + } + target_size_type: MAXIMIZE_TARGET_DIMENSION + } + } +} + +# ENCODING(required): encode the video stream for the final cropped output. +node { + calculator: "VideoPreStreamCalculator" + # Fetch frame format and dimension from input frames. + input_stream: "FRAME:cropped_frames" + # Copying frame rate and duration from original video. 
+ input_stream: "VIDEO_PRESTREAM:video_header" + output_stream: "output_frames_video_header" +} + +node { + calculator: "OpenCvVideoEncoderCalculator" + input_stream: "VIDEO:cropped_frames" + input_stream: "VIDEO_PRESTREAM:output_frames_video_header" + input_side_packet: "OUTPUT_FILE_PATH:output_video_path" + input_side_packet: "AUDIO_FILE_PATH:audio_path" + node_options: { + [type.googleapis.com/mediapipe.OpenCvVideoEncoderCalculatorOptions]: { + codec: "avc1" + video_format: "mp4" + } + } +} +``` + +## Advanced Parameters + +### Required vs. Best-Effort Saliency Features + +AutoFlip allows users to implement and specify custom features to be used in the +camera trajectory computation. If the user would like to detect and preserve +scenes of lions in a wildlife protection video, for example, they could +implement and add a feature detection calculator for lions into the pipeline. +Refer to `AutoFlipFaceDetectionSubgraph` and `FaceToRegionCalculator`, or +`AutoFlipObjectDetectionSubgraph` and `LocalizationToRegionCalculator` for +examples of how to create new feature detection calculators. + +After adding different feature signals into the graph, use the +`SignalFusingCalculator` node to specify types and weights for different feature +signals. For example, in the graph above, we specified the `face_regions` and +`object_regions` input streams to represent face signals and agnostic object +signals, respectively. + +The larger the weight, the more important the features will be considered when +AutoFlip computes the camera trajectory. Use the `is_required` flag to mark a +feature as a hard constraint, in which case the computed camera trajectory will +try its best to cover these feature types in the cropped videos. If for some reason +the required features cannot all be covered (for example, when they are too +spread out in the video), AutoFlip will apply a padding effect to cover as much +salient content as possible. See an illustration below.
+ +![graph is_required](https://mediapipe.dev/images/autoflip_is_required.gif) + +### Stable vs Tracking Camera Motion + +AutoFlip makes a decision on each scene whether to have the cropped viewpoint +follow an object or if the crop should remain stable (centered on detected +objects). The parameter `motion_stabilization_threshold_percent` value is used +to make the decision to track action or keep the camera stable. If, over the +duration of the scene, all detected focus objects remain within this ratio of +the frame (e.g. 0.5 = 50% or 1920 * .5 = 960 pixels on 1080p video) then the +camera is held steady. Otherwise the camera tracks activity within the frame. + +### Snap To Center + +For some scenes the camera viewpoint will remain stable at the center of +activity (see `motion_stabilization_threshold_percent` setting). In this case, +if the determined best stable viewpoint is within +`snap_center_max_distance_percent` of the frame's center the camera will be +shifted to be locked to the center of the frame. This setting is useful for +videos where the camera operator did a good job already centering content or if +titles and logos are expected to appear in the center of the frame. It may be +less useful on raw content where objects are not already well positioned on +screen. + +### Visualization to Facilitate Debugging + +`SceneCroppingCalculator` provides two extra output streams +`KEY_FRAME_CROP_REGION_VIZ_FRAMES` and `SALIENT_POINT_FRAME_VIZ_FRAMES` to +visualize the cropping window as well as salient points detected on each frame. +You could modify the `SceneCroppingCalculator` node like below to enable these +two output streams. 
+ +```bash +node { + calculator: "SceneCroppingCalculator" + input_side_packet: "EXTERNAL_ASPECT_RATIO:aspect_ratio" + input_stream: "VIDEO_FRAMES:video_raw" + input_stream: "KEY_FRAMES:video_frames_scaled_downsampled" + input_stream: "DETECTION_FEATURES:salient_regions" + input_stream: "STATIC_FEATURES:borders" + input_stream: "SHOT_BOUNDARIES:shot_change" + output_stream: "CROPPED_FRAMES:cropped_frames" + output_stream: "KEY_FRAME_CROP_REGION_VIZ_FRAMES:key_frame_crop_viz_frames" + output_stream: "SALIENT_POINT_FRAME_VIZ_FRAMES:salient_point_viz_frames" + node_options: { + [type.googleapis.com/mediapipe.autoflip.SceneCroppingCalculatorOptions]: { + max_scene_size: 600 + key_frame_crop_options: { + score_aggregation_type: CONSTANT + } + scene_camera_motion_analyzer_options: { + motion_stabilization_threshold_percent: 0.5 + salient_point_bound: 0.499 + } + padding_parameters: { + blur_cv_size: 200 + overlay_opacity: 0.6 + } + target_size_type: MAXIMIZE_TARGET_DIMENSION + } + } +} +``` diff --git a/docs/solutions/box_tracking.md b/docs/solutions/box_tracking.md new file mode 100644 index 0000000..537916a --- /dev/null +++ b/docs/solutions/box_tracking.md @@ -0,0 +1,160 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/guide#legacy +title: Box Tracking +parent: MediaPipe Legacy Solutions +nav_order: 10 +--- + +# MediaPipe Box Tracking +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +We have ended support for this MediaPipe Legacy Solution as of March 1, 2023. +For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/guide#legacy) +site.* + +---- + +## Overview + +MediaPipe Box Tracking has been powering real-time tracking in +[Motion Stills](https://ai.googleblog.com/2016/12/get-moving-with-new-motion-stills.html), +[YouTube's privacy blur](https://youtube-creators.googleblog.com/2016/02/blur-moving-objects-in-your-video-with.html), +and [Google Lens](https://lens.google.com/) for several years, leveraging +classic computer vision approaches. + +The box tracking solution consumes image frames from a video or camera stream, +and starting box positions with timestamps, indicating 2D regions of interest to +track, and computes the tracked box positions for each frame. In this specific +use case, the starting box positions come from object detection, but the +starting position can also be provided manually by the user or another system. +Our solution consists of three main components: a motion analysis component, a +flow packager component, and a box tracking component. Each component is +encapsulated as a MediaPipe calculator, and the box tracking solution as a whole +is represented as a MediaPipe +[subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/box_tracking_gpu.pbtxt). + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). + +In the +[box tracking subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/box_tracking_gpu.pbtxt), +the MotionAnalysis calculator extracts features (e.g. high-gradient corners) +across the image, tracks those features over time, classifies them into +foreground and background features, and estimates both local motion vectors and +the global motion model. 
The FlowPackager calculator packs the estimated motion +metadata into an efficient format. The BoxTracker calculator takes this motion +metadata from the FlowPackager calculator and the position of starting boxes, +and tracks the boxes over time. Using solely the motion data (without the need +for the RGB frames) produced by the MotionAnalysis calculator, the BoxTracker +calculator tracks individual objects or regions while discriminating from +others. Please see +[Object Detection and Tracking using MediaPipe](https://developers.googleblog.com/2019/12/object-detection-and-tracking-using-mediapipe.html) +in Google Developers Blog for more details. + +An advantage of our architecture is that by separating motion analysis into a +dedicated MediaPipe calculator and tracking features over the whole image, we +enable great flexibility and constant computation independent of the number of +regions tracked! By not having to rely on the RGB frames during tracking, our +tracking solution provides the flexibility to cache the metadata across a batch +of frames. Caching enables tracking of regions both backwards and forwards in +time, or even syncing directly to a specified timestamp for tracking with random +access. + +## Object Detection and Tracking + +MediaPipe Box Tracking can be paired with ML inference, resulting in valuable +and efficient pipelines. For instance, box tracking can be paired with ML-based +object detection to create an object detection and tracking pipeline. With +tracking, this pipeline offers several advantages over running detection per +frame (e.g., [MediaPipe Object Detection](./object_detection.md)): + +* It provides instance-based tracking, i.e. the object ID is maintained across + frames. +* Detection does not have to run every frame. This enables running heavier + detection models that are more accurate while keeping the pipeline + lightweight and real-time on mobile devices.
+* Object localization is temporally consistent with the help of tracking, + meaning less jitter is observable across frames. + +![object_tracking_android_gpu.gif](https://mediapipe.dev/images/mobile/object_tracking_android_gpu.gif) | +:----------------------------------------------------------------------------------: | +*Fig 1. Box tracking paired with ML-based object detection.* | + +The object detection and tracking pipeline can be implemented as a MediaPipe +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt), +which internally utilizes an +[object detection subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/object_detection_gpu.pbtxt), +an +[object tracking subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/object_tracking_gpu.pbtxt), +and a +[renderer subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/renderer_gpu.pbtxt). + +In general, the object detection subgraph (which performs ML model inference +internally) runs only upon request, e.g. at an arbitrary frame rate or triggered +by specific signals. More specifically, in this particular +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt) +a PacketResampler calculator temporally subsamples the incoming video frames to +0.5 fps before they are passed into the object detection subgraph. This frame +rate can be configured differently as an option in PacketResampler. + +The object tracking subgraph runs in real-time on every incoming frame to track +the detected objects. 
It expands the +[box tracking subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/subgraphs/box_tracking_gpu.pbtxt) +with additional functionality: when new detections arrive it uses IoU +(Intersection over Union) to associate the current tracked objects/boxes with +new detections to remove obsolete or duplicated boxes. + +## Example Apps + +Please first see general instructions for +[Android](../getting_started/android.md), [iOS](../getting_started/ios.md) and +[desktop](../getting_started/cpp.md) on how to build MediaPipe examples. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +### Mobile + +Note: Object detection is using TensorFlow Lite on GPU while tracking is on CPU. + +* Graph: + [`mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/object_detection_tracking_mobile_gpu.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1UXL9jX4Wpp34TsiVogugV3J3T9_C5UK-) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu:objecttrackinggpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/objecttrackinggpu/BUILD) +* iOS target: Not available + +### Desktop + +* Running on CPU (both for object detection using TensorFlow Lite and + tracking): + * Graph: + [`mediapipe/graphs/tracking/object_detection_tracking_desktop_live.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/tracking/object_detection_tracking_desktop_live.pbtxt) + * Target: + 
[`mediapipe/examples/desktop/object_tracking:object_tracking_cpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/object_tracking/BUILD) +* Running on GPU: Not available + +## Resources + +* Google Developers Blog: + [Object Detection and Tracking using MediaPipe](https://developers.googleblog.com/2019/12/object-detection-and-tracking-using-mediapipe.html) +* Google AI Blog: + [Get moving with the new Motion Stills](https://ai.googleblog.com/2016/12/get-moving-with-new-motion-stills.html) +* YouTube Creator Blog: [Blur moving objects in your video with the new Custom + blurring tool on + YouTube](https://youtube-creators.googleblog.com/2016/02/blur-moving-objects-in-your-video-with.html) diff --git a/docs/solutions/face_detection.md b/docs/solutions/face_detection.md new file mode 100644 index 0000000..93f239c --- /dev/null +++ b/docs/solutions/face_detection.md @@ -0,0 +1,521 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/vision/face_detector/ +title: Face Detection +parent: MediaPipe Legacy Solutions +nav_order: 1 +--- + +# MediaPipe Face Detection +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +As of May 10, 2023, this solution was upgraded to a new MediaPipe +Solution. For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/vision/face_detector) +site.* + +---- + +## Overview + +MediaPipe Face Detection is an ultrafast face detection solution that comes with +6 landmarks and multi-face support. It is based on +[BlazeFace](https://arxiv.org/abs/1907.05047), a lightweight and well-performing +face detector tailored for mobile GPU inference. The detector's super-realtime +performance enables it to be applied to any live viewfinder experience that +requires an accurate facial region of interest as an input for other +task-specific models, such as 3D facial keypoint estimation (e.g., +[MediaPipe Face Mesh](./face_mesh.md)), facial features or expression +classification, and face region segmentation. BlazeFace uses a lightweight +feature extraction network inspired by, but distinct from +[MobileNetV1/V2](https://ai.googleblog.com/2018/04/mobilenetv2-next-generation-of-on.html), +a GPU-friendly anchor scheme modified from +[Single Shot MultiBox Detector (SSD)](https://arxiv.org/abs/1512.02325), and an +improved tie resolution strategy alternative to non-maximum suppression. For +more information about BlazeFace, please see the [Resources](#resources) +section. + +![face_detection_android_gpu.gif](https://mediapipe.dev/images/mobile/face_detection_android_gpu.gif) + +## Solution APIs + +### Configuration Options + +Naming style and availability may differ slightly across platforms/languages. + +#### model_selection + +An integer index `0` or `1`. Use `0` to select a short-range model that works +best for faces within 2 meters from the camera, and `1` for a full-range model +best for faces within 5 meters. For the full-range option, a sparse model is +used for its improved inference speed. 
Please refer to the +[model cards](./models.md#face_detection) for details. Default to `0` if not +specified. + +Note: Not available for JavaScript (use "model" instead). + +#### model + +A string value to indicate which model should be used. Use "short" to +select a short-range model that works best for faces within 2 meters from the +camera, and "full" for a full-range model best for faces within 5 meters. For +the full-range option, a sparse model is used for its improved inference speed. +Please refer to the model cards for details. Default to empty string. + +Note: Valid only for JavaScript solution. + +#### selfie_mode + +A boolean value to indicate whether to flip the images/video frames +horizontally or not. Default to `false`. + +Note: Valid only for JavaScript solution. + +#### min_detection_confidence + +Minimum confidence value (`[0.0, 1.0]`) from the face detection model for the +detection to be considered successful. Default to `0.5`. + +### Output + +Naming style may differ slightly across platforms/languages. + +#### detections + +Collection of detected faces, where each face is represented as a detection +proto message that contains a bounding box and 6 key points (right eye, left +eye, nose tip, mouth center, right ear tragion, and left ear tragion). The +bounding box is composed of `xmin` and `width` (both normalized to `[0.0, 1.0]` +by the image width) and `ymin` and `height` (both normalized to `[0.0, 1.0]` by +the image height). Each key point is composed of `x` and `y`, which are +normalized to `[0.0, 1.0]` by the image width and height respectively. + +### Python Solution API + +Please first follow general [instructions](../getting_started/python.md) to +install MediaPipe Python package, then learn more in the companion +[Python Colab](#resources) and the usage example below. 
+ +Supported configuration options: + +* [model_selection](#model_selection) +* [min_detection_confidence](#min_detection_confidence) + +```python +import cv2 +import mediapipe as mp +mp_face_detection = mp.solutions.face_detection +mp_drawing = mp.solutions.drawing_utils + +# For static images: +IMAGE_FILES = [] +with mp_face_detection.FaceDetection( + model_selection=1, min_detection_confidence=0.5) as face_detection: + for idx, file in enumerate(IMAGE_FILES): + image = cv2.imread(file) + # Convert the BGR image to RGB and process it with MediaPipe Face Detection. + results = face_detection.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # Draw face detections of each face. + if not results.detections: + continue + annotated_image = image.copy() + for detection in results.detections: + print('Nose tip:') + print(mp_face_detection.get_key_point( + detection, mp_face_detection.FaceKeyPoint.NOSE_TIP)) + mp_drawing.draw_detection(annotated_image, detection) + cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', annotated_image) + +# For webcam input: +cap = cv2.VideoCapture(0) +with mp_face_detection.FaceDetection( + model_selection=0, min_detection_confidence=0.5) as face_detection: + while cap.isOpened(): + success, image = cap.read() + if not success: + print("Ignoring empty camera frame.") + # If loading a video, use 'break' instead of 'continue'. + continue + + # To improve performance, optionally mark the image as not writeable to + # pass by reference. + image.flags.writeable = False + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + results = face_detection.process(image) + + # Draw the face detection annotations on the image. + image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + if results.detections: + for detection in results.detections: + mp_drawing.draw_detection(image, detection) + # Flip the image horizontally for a selfie-view display. 
+ cv2.imshow('MediaPipe Face Detection', cv2.flip(image, 1)) + if cv2.waitKey(5) & 0xFF == 27: + break +cap.release() +``` + +### JavaScript Solution API + +Please first see general [introduction](../getting_started/javascript.md) on +MediaPipe in JavaScript, then learn more in the companion [web demo](#resources) +and the following usage example. + +Supported face detection options: +* [selfieMode](#selfie_mode) +* [model](#model) +* [minDetectionConfidence](#min_detection_confidence) + +```html + + + + + + + + + + + +
+ + +
+ + +``` + +```javascript + +``` + +### Android Solution API + +Please first follow general +[instructions](../getting_started/android_solutions.md) to add MediaPipe Gradle +dependencies and try the Android Solution API in the companion +[example Android Studio project](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/solutions/facedetection), +and learn more in the usage example below. + +Supported configuration options: + +* [staticImageMode](#static_image_mode) +* [modelSelection](#model_selection) + +#### Camera Input + +```java +// For camera input and result rendering with OpenGL. +FaceDetectionOptions faceDetectionOptions = + FaceDetectionOptions.builder() + .setStaticImageMode(false) + .setModelSelection(0).build(); +FaceDetection faceDetection = new FaceDetection(this, faceDetectionOptions); +faceDetection.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Face Detection error:" + message)); + +// Initializes a new CameraInput instance and connects it to MediaPipe Face Detection Solution. +CameraInput cameraInput = new CameraInput(this); +cameraInput.setNewFrameListener( + textureFrame -> faceDetection.send(textureFrame)); + +// Initializes a new GlSurfaceView with a ResultGlRenderer instance +// that provides the interfaces to run user-defined OpenGL rendering code. +// See mediapipe/examples/android/solutions/facedetection/src/main/java/com/google/mediapipe/examples/facedetection/FaceDetectionResultGlRenderer.java +// as an example. 
+SolutionGlSurfaceView glSurfaceView = + new SolutionGlSurfaceView<>( + this, faceDetection.getGlContext(), faceDetection.getGlMajorVersion()); +glSurfaceView.setSolutionResultRenderer(new FaceDetectionResultGlRenderer()); +glSurfaceView.setRenderInputImage(true); +faceDetection.setResultListener( + faceDetectionResult -> { + if (faceDetectionResult.multiFaceDetections().isEmpty()) { + return; + } + RelativeKeypoint noseTip = + faceDetectionResult + .multiFaceDetections() + .get(0) + .getLocationData() + .getRelativeKeypoints(FaceKeypoint.NOSE_TIP); + Log.i( + TAG, + String.format( + "MediaPipe Face Detection nose tip normalized coordinates (value range: [0, 1]): x=%f, y=%f", + noseTip.getX(), noseTip.getY())); + // Request GL rendering. + glSurfaceView.setRenderData(faceDetectionResult); + glSurfaceView.requestRender(); + }); + +// The runnable to start camera after the GLSurfaceView is attached. +glSurfaceView.post( + () -> + cameraInput.start( + this, + faceDetection.getGlContext(), + CameraInput.CameraFacing.FRONT, + glSurfaceView.getWidth(), + glSurfaceView.getHeight())); +``` + +#### Image Input + +```java +// For reading images from gallery and drawing the output in an ImageView. +FaceDetectionOptions faceDetectionOptions = + FaceDetectionOptions.builder() + .setStaticImageMode(true) + .setModelSelection(0).build(); +FaceDetection faceDetection = new FaceDetection(this, faceDetectionOptions); + +// Connects MediaPipe Face Detection Solution to the user-defined ImageView +// instance that allows users to have the custom drawing of the output landmarks +// on it. See mediapipe/examples/android/solutions/facedetection/src/main/java/com/google/mediapipe/examples/facedetection/FaceDetectionResultImageView.java +// as an example. 
+FaceDetectionResultImageView imageView = new FaceDetectionResultImageView(this); +faceDetection.setResultListener( + faceDetectionResult -> { + if (faceDetectionResult.multiFaceDetections().isEmpty()) { + return; + } + int width = faceDetectionResult.inputBitmap().getWidth(); + int height = faceDetectionResult.inputBitmap().getHeight(); + RelativeKeypoint noseTip = + faceDetectionResult + .multiFaceDetections() + .get(0) + .getLocationData() + .getRelativeKeypoints(FaceKeypoint.NOSE_TIP); + Log.i( + TAG, + String.format( + "MediaPipe Face Detection nose tip coordinates (pixel values): x=%f, y=%f", + noseTip.getX() * width, noseTip.getY() * height)); + // Request canvas drawing. + imageView.setFaceDetectionResult(faceDetectionResult); + runOnUiThread(() -> imageView.update()); + }); +faceDetection.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Face Detection error:" + message)); + +// ActivityResultLauncher to get an image from the gallery as Bitmap. +ActivityResultLauncher imageGetter = + registerForActivityResult( + new ActivityResultContracts.StartActivityForResult(), + result -> { + Intent resultIntent = result.getData(); + if (resultIntent != null && result.getResultCode() == RESULT_OK) { + Bitmap bitmap = null; + try { + bitmap = + MediaStore.Images.Media.getBitmap( + this.getContentResolver(), resultIntent.getData()); + // Please also rotate the Bitmap based on its orientation. + } catch (IOException e) { + Log.e(TAG, "Bitmap reading error:" + e); + } + if (bitmap != null) { + faceDetection.send(bitmap); + } + } + }); +Intent pickImageIntent = new Intent(Intent.ACTION_PICK); +pickImageIntent.setDataAndType(MediaStore.Images.Media.INTERNAL_CONTENT_URI, "image/*"); +imageGetter.launch(pickImageIntent); +``` + +#### Video Input + +```java +// For video input and result rendering with OpenGL. 
+FaceDetectionOptions faceDetectionOptions = + FaceDetectionOptions.builder() + .setStaticImageMode(false) + .setModelSelection(0).build(); +FaceDetection faceDetection = new FaceDetection(this, faceDetectionOptions); +faceDetection.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Face Detection error:" + message)); + +// Initializes a new VideoInput instance and connects it to MediaPipe Face Detection Solution. +VideoInput videoInput = new VideoInput(this); +videoInput.setNewFrameListener( + textureFrame -> faceDetection.send(textureFrame)); + +// Initializes a new GlSurfaceView with a ResultGlRenderer instance +// that provides the interfaces to run user-defined OpenGL rendering code. +// See mediapipe/examples/android/solutions/facedetection/src/main/java/com/google/mediapipe/examples/facedetection/FaceDetectionResultGlRenderer.java +// as an example. +SolutionGlSurfaceView glSurfaceView = + new SolutionGlSurfaceView<>( + this, faceDetection.getGlContext(), faceDetection.getGlMajorVersion()); +glSurfaceView.setSolutionResultRenderer(new FaceDetectionResultGlRenderer()); +glSurfaceView.setRenderInputImage(true); + +faceDetection.setResultListener( + faceDetectionResult -> { + if (faceDetectionResult.multiFaceDetections().isEmpty()) { + return; + } + RelativeKeypoint noseTip = + faceDetectionResult + .multiFaceDetections() + .get(0) + .getLocationData() + .getRelativeKeypoints(FaceKeypoint.NOSE_TIP); + Log.i( + TAG, + String.format( + "MediaPipe Face Detection nose tip normalized coordinates (value range: [0, 1]): x=%f, y=%f", + noseTip.getX(), noseTip.getY())); + // Request GL rendering. 
+ glSurfaceView.setRenderData(faceDetectionResult); + glSurfaceView.requestRender(); + }); + +ActivityResultLauncher videoGetter = + registerForActivityResult( + new ActivityResultContracts.StartActivityForResult(), + result -> { + Intent resultIntent = result.getData(); + if (resultIntent != null) { + if (result.getResultCode() == RESULT_OK) { + glSurfaceView.post( + () -> + videoInput.start( + this, + resultIntent.getData(), + faceDetection.getGlContext(), + glSurfaceView.getWidth(), + glSurfaceView.getHeight())); + } + } + }); +Intent pickVideoIntent = new Intent(Intent.ACTION_PICK); +pickVideoIntent.setDataAndType(MediaStore.Video.Media.INTERNAL_CONTENT_URI, "video/*"); +videoGetter.launch(pickVideoIntent); +``` + +## Example Apps + +Please first see general instructions for +[Android](../getting_started/android.md), [iOS](../getting_started/ios.md) and +[desktop](../getting_started/cpp.md) on how to build MediaPipe examples. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). 
+ +### Mobile + +#### GPU Pipeline + +* Graph: + [`mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1DZTCy1gp238kkMnu4fUkwI3IrF77Mhy5) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectiongpu:facedetectiongpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectiongpu/BUILD) +* iOS target: + [`mediapipe/examples/ios/facedetectiongpu:FaceDetectionGpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/facedetectiongpu/BUILD) + +#### CPU Pipeline + +This is very similar to the [GPU pipeline](#gpu-pipeline) except that at the +beginning and the end of the pipeline it performs GPU-to-CPU and CPU-to-GPU +image transfer respectively. As a result, the rest of the graph, which shares the +same configuration as the GPU pipeline, runs entirely on CPU. 
+ +* Graph: + [`mediapipe/graphs/face_detection/face_detection_mobile_cpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_detection/face_detection_mobile_cpu.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1npiZY47jbO5m2YaL63o5QoCQs40JC6C7) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectioncpu:facedetectioncpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facedetectioncpu/BUILD) +* iOS target: + [`mediapipe/examples/ios/facedetectioncpu:FaceDetectionCpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/facedetectioncpu/BUILD) + +### Desktop + +* Running on CPU: + * Graph: + [`mediapipe/graphs/face_detection/face_detection_desktop_live.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_detection/face_detection_desktop_live.pbtxt) + * Target: + [`mediapipe/examples/desktop/face_detection:face_detection_cpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/face_detection/BUILD) +* Running on GPU + * Graph: + [`mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_detection/face_detection_mobile_gpu.pbtxt) + * Target: + [`mediapipe/examples/desktop/face_detection:face_detection_gpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/face_detection/BUILD) + +### Coral + +Please refer to +[these instructions](https://github.com/google/mediapipe/tree/master/mediapipe/examples/coral/README.md) +to cross-compile and run MediaPipe examples on the +[Coral Dev Board](https://coral.ai/products/dev-board). 
+ +## Resources + +* Paper: + [BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs](https://arxiv.org/abs/1907.05047) + ([presentation](https://docs.google.com/presentation/d/1YCtASfnYyZtH-41QvnW5iZxELFnf0MF-pPWSLGj8yjQ/present?slide=id.g5bc8aeffdd_1_0)) + ([poster](https://drive.google.com/file/d/1u6aB6wxDY7X2TmeUUKgFydulNtXkb3pu/view)) +* [Models and model cards](./models.md#face_detection) +* [Web demo](https://code.mediapipe.dev/codepen/face_detection) +* [Python Colab](https://mediapipe.page.link/face_detection_py_colab) diff --git a/docs/solutions/face_mesh.md b/docs/solutions/face_mesh.md new file mode 100644 index 0000000..a859baf --- /dev/null +++ b/docs/solutions/face_mesh.md @@ -0,0 +1,773 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/vision/face_landmarker/ +title: Face Mesh +parent: MediaPipe Legacy Solutions +nav_order: 2 +--- + +# MediaPipe Face Mesh +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +As of May 10, 2023, this solution was upgraded to a new MediaPipe +Solution. For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/vision/face_landmarker) +site.* + +---- + +## Overview + +MediaPipe Face Mesh is a solution that estimates 468 3D face landmarks in +real-time even on mobile devices. It employs machine learning (ML) to infer the +3D facial surface, requiring only a single camera input without the need for a +dedicated depth sensor. Utilizing lightweight model architectures together with +GPU acceleration throughout the pipeline, the solution delivers real-time +performance critical for live experiences. + +Additionally, the solution is bundled with the Face Transform module that +bridges the gap between the face landmark estimation and useful real-time +augmented reality (AR) applications. It establishes a metric 3D space and uses +the face landmark screen positions to estimate a face transform within that +space. The face transform data consists of common 3D primitives, including a +face pose transformation matrix and a triangular face mesh. Under the hood, a +lightweight statistical analysis method called +[Procrustes Analysis](https://en.wikipedia.org/wiki/Procrustes_analysis) is +employed to drive a robust, performant and portable logic. The analysis runs on +CPU and has a minimal speed/memory footprint on top of the ML model inference. + +![face_mesh_ar_effects.gif](https://mediapipe.dev/images/face_mesh_ar_effects.gif) | +:-------------------------------------------------------------: | +*Fig 1. 
AR effects utilizing the 3D facial surface.* | + +## ML Pipeline + +Our ML pipeline consists of two real-time deep neural network models that work +together: A detector that operates on the full image and computes face locations +and a 3D face landmark model that operates on those locations and predicts the +approximate 3D surface via regression. Having the face accurately cropped +drastically reduces the need for common data augmentations like affine +transformations consisting of rotations, translation and scale changes. Instead +it allows the network to dedicate most of its capacity towards coordinate +prediction accuracy. In addition, in our pipeline the crops can also be +generated based on the face landmarks identified in the previous frame, and only +when the landmark model could no longer identify face presence is the face +detector invoked to relocalize the face. This strategy is similar to that +employed in our [MediaPipe Hands](./hands.md) solution, which uses a palm +detector together with a hand landmark model. + +The pipeline is implemented as a MediaPipe +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_mesh/face_mesh_mobile.pbtxt) +that uses a +[face landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_landmark/face_landmark_front_gpu.pbtxt) +from the +[face landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_landmark), +and renders using a dedicated +[face renderer subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_mesh/subgraphs/face_renderer_gpu.pbtxt). 
+The +[face landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_landmark/face_landmark_front_gpu.pbtxt) +internally uses a +[face_detection_subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_detection/face_detection_short_range_gpu.pbtxt) +from the +[face detection module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_detection). + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +### Models + +#### Face Detection Model + +The face detector is the same [BlazeFace](https://arxiv.org/abs/1907.05047) +model used in [MediaPipe Face Detection](./face_detection.md). Please refer to +[MediaPipe Face Detection](./face_detection.md) for details. + +#### Face Landmark Model + +For 3D face landmarks we employed transfer learning and trained a network with +several objectives: the network simultaneously predicts 3D landmark coordinates +on synthetic rendered data and 2D semantic contours on annotated real-world +data. The resulting network provided us with reasonable 3D landmark predictions +not just on synthetic but also on real-world data. + +The 3D landmark network receives as input a cropped video frame without +additional depth input. The model outputs the positions of the 3D points, as +well as the probability of a face being present and reasonably aligned in the +input. A common alternative approach is to predict a 2D heatmap for each +landmark, but it is not amenable to depth prediction and has high computational +costs for so many points. We further improve the accuracy and robustness of our +model by iteratively bootstrapping and refining predictions. That way we can +grow our dataset to increasingly challenging cases, such as grimaces, oblique +angle and occlusions. 
+ +You can find more information about the face landmark model in this +[paper](https://arxiv.org/abs/1907.06724). + +![face_mesh_android_gpu.gif](https://mediapipe.dev/images/mobile/face_mesh_android_gpu.gif) | +:------------------------------------------------------------------------: | +*Fig 2. Face landmarks: the red box indicates the cropped area as input to the landmark model, the red dots represent the 468 landmarks in 3D, and the green lines connecting landmarks illustrate the contours around the eyes, eyebrows, lips and the entire face.* | + +#### Attention Mesh Model + +In addition to the [Face Landmark Model](#face-landmark-model) we provide +another model that applies +[attention](https://en.wikipedia.org/wiki/Attention_(machine_learning)) to +semantically meaningful face regions, thereby predicting landmarks more +accurately around lips, eyes and irises, at the expense of more compute. It +enables applications like AR makeup and AR puppeteering. + +The attention mesh model can be selected in the Solution APIs via the +[refine_landmarks](#refine_landmarks) option. You can also find more information +about the model in this [paper](https://arxiv.org/abs/2006.10962). + +![attention_mesh_architecture.png](https://mediapipe.dev/images/attention_mesh_architecture.png) | +:---------------------------------------------------------------------------: | +*Fig 3. Attention Mesh: Overview of model architecture.* | + +## Face Transform Module + +The [Face Landmark Model](#face-landmark-model) performs a single-camera face landmark +detection in the screen coordinate space: the X- and Y- coordinates are +normalized screen coordinates, while the Z coordinate is relative and is scaled +as the X coordinate under the +[weak perspective projection camera model](https://en.wikipedia.org/wiki/3D_projection#Weak_perspective_projection). 
+This format is well-suited for some applications; however, it does not directly +enable the full spectrum of augmented reality (AR) features like aligning a +virtual 3D object with a detected face. + +The +[Face Transform module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry) +moves away from the screen coordinate space towards a metric 3D space and +provides necessary primitives to handle a detected face as a regular 3D object. +By design, you'll be able to use a perspective camera to project the final 3D +scene back into the screen coordinate space with a guarantee that the face +landmark positions are not changed. + +### Key Concepts + +#### Metric 3D Space + +The **Metric 3D space** established within the Face Transform module is a +right-handed orthonormal metric 3D coordinate space. Within the space, there is +a **virtual perspective camera** located at the space origin and pointed in the +negative direction of the Z-axis. In the current pipeline, it is assumed that +the input camera frames are observed by exactly this virtual camera and +therefore its parameters are later used to convert the screen landmark +coordinates back into the Metric 3D space. The *virtual camera parameters* can +be set freely; however, for better results it is advised to set them as close to +the *real physical camera parameters* as possible. + +![face_geometry_metric_3d_space.gif](https://mediapipe.dev/images/face_geometry_metric_3d_space.gif) | +:-------------------------------------------------------------------------------: | +*Fig 4. A visualization of multiple key elements in the Metric 3D space.* | + +#### Canonical Face Model + +The **Canonical Face Model** is a static 3D model of a human face, which follows +the 468 3D face landmark topology of the +[Face Landmark Model](#face-landmark-model). 
The model bears two important +functions: + +- **Defines metric units**: the scale of the canonical face model defines the + metric units of the Metric 3D space. A metric unit used by the + [default canonical face model](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry/data/canonical_face_model.fbx) + is a centimeter; +- **Bridges static and runtime spaces**: the face pose transformation matrix + is - in fact - a linear map from the canonical face model into the runtime + face landmark set estimated on each frame. This way, virtual 3D assets + modeled around the canonical face model can be aligned with a tracked face + by applying the face pose transformation matrix to them. + +### Components + +#### Transform Pipeline + +The **Transform Pipeline** is a key component, which is responsible for +estimating the face transform objects within the Metric 3D space. On each frame, +the following steps are executed in the given order: + +- Face landmark screen coordinates are converted into the Metric 3D space + coordinates; +- Face pose transformation matrix is estimated as a rigid linear mapping from + the canonical face metric landmark set into the runtime face metric landmark + set in a way that minimizes a difference between the two; +- A face mesh is created using the runtime face metric landmarks as the vertex + positions (XYZ), while both the vertex texture coordinates (UV) and the + triangular topology are inherited from the canonical face model. + +The transform pipeline is implemented as a MediaPipe +[calculator](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry/geometry_pipeline_calculator.cc). +For your convenience, this calculator is bundled together with corresponding +metadata into a unified MediaPipe +[subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry/face_geometry_from_landmarks.pbtxt). 
+The face transform format is defined as a Protocol Buffer +[message](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry/protos/face_geometry.proto). + +#### Effect Renderer + +The **Effect Renderer** is a component, which serves as a working example of a +face effect renderer. It targets the *OpenGL ES 2.0* API to enable real-time +performance on mobile devices and supports the following rendering modes: + +- **3D object rendering mode**: a virtual object is aligned with a detected + face to emulate an object attached to the face (example: glasses); +- **Face mesh rendering mode**: a texture is stretched on top of the face mesh + surface to emulate a face painting technique. + +In both rendering modes, the face mesh is first rendered as an occluder straight +into the depth buffer. This step helps to create a more believable effect via +hiding invisible elements behind the face surface. + +The effect renderer is implemented as a MediaPipe +[calculator](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry/effect_renderer_calculator.cc). + +| ![face_geometry_renderer.gif](https://mediapipe.dev/images/face_geometry_renderer.gif) | +| :---------------------------------------------------------------------: | +| *Fig 5. An example of face effects rendered by the Face Transform Effect Renderer.* | + +## Solution APIs + +### Configuration Options + +Naming style and availability may differ slightly across platforms/languages. + +#### static_image_mode + +If set to `false`, the solution treats the input images as a video stream. It +will try to detect faces in the first input images, and upon a successful +detection further localizes the face landmarks. In subsequent images, once all +[max_num_faces](#max_num_faces) faces are detected and the corresponding face +landmarks are localized, it simply tracks those landmarks without invoking +another detection until it loses track of any of the faces. 
This reduces latency +and is ideal for processing video frames. If set to `true`, face detection runs +on every input image, ideal for processing a batch of static, possibly +unrelated, images. Default to `false`. + +#### max_num_faces + +Maximum number of faces to detect. Default to `1`. + +#### refine_landmarks + +Whether to further refine the landmark coordinates around the eyes and lips, and +output additional landmarks around the irises by applying the +[Attention Mesh Model](#attention-mesh-model). Default to `false`. + +#### min_detection_confidence + +Minimum confidence value (`[0.0, 1.0]`) from the face detection model for the +detection to be considered successful. Default to `0.5`. + +#### min_tracking_confidence + +Minimum confidence value (`[0.0, 1.0]`) from the landmark-tracking model for the +face landmarks to be considered tracked successfully, or otherwise face +detection will be invoked automatically on the next input image. Setting it to a +higher value can increase robustness of the solution, at the expense of a higher +latency. Ignored if [static_image_mode](#static_image_mode) is `true`, where +face detection simply runs on every image. Default to `0.5`. + +### Output + +Naming style may differ slightly across platforms/languages. + +#### multi_face_landmarks + +Collection of detected/tracked faces, where each face is represented as a list +of 468 face landmarks and each landmark is composed of `x`, `y` and `z`. `x` and +`y` are normalized to `[0.0, 1.0]` by the image width and height respectively. +`z` represents the landmark depth with the depth at center of the head being the +origin, and the smaller the value the closer the landmark is to the camera. The +magnitude of `z` uses roughly the same scale as `x`. + +### Python Solution API + +Please first follow general [instructions](../getting_started/python.md) to +install MediaPipe Python package, then learn more in the companion +[Python Colab](#resources) and the usage example below. 
+ +Supported configuration options: + +* [static_image_mode](#static_image_mode) +* [max_num_faces](#max_num_faces) +* [refine_landmarks](#refine_landmarks) +* [min_detection_confidence](#min_detection_confidence) +* [min_tracking_confidence](#min_tracking_confidence) + +```python +import cv2 +import mediapipe as mp +mp_drawing = mp.solutions.drawing_utils +mp_drawing_styles = mp.solutions.drawing_styles +mp_face_mesh = mp.solutions.face_mesh + +# For static images: +IMAGE_FILES = [] +drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1) +with mp_face_mesh.FaceMesh( + static_image_mode=True, + max_num_faces=1, + refine_landmarks=True, + min_detection_confidence=0.5) as face_mesh: + for idx, file in enumerate(IMAGE_FILES): + image = cv2.imread(file) + # Convert the BGR image to RGB before processing. + results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # Print and draw face mesh landmarks on the image. + if not results.multi_face_landmarks: + continue + annotated_image = image.copy() + for face_landmarks in results.multi_face_landmarks: + print('face_landmarks:', face_landmarks) + mp_drawing.draw_landmarks( + image=annotated_image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACEMESH_TESSELATION, + landmark_drawing_spec=None, + connection_drawing_spec=mp_drawing_styles + .get_default_face_mesh_tesselation_style()) + mp_drawing.draw_landmarks( + image=annotated_image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACEMESH_CONTOURS, + landmark_drawing_spec=None, + connection_drawing_spec=mp_drawing_styles + .get_default_face_mesh_contours_style()) + mp_drawing.draw_landmarks( + image=annotated_image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACEMESH_IRISES, + landmark_drawing_spec=None, + connection_drawing_spec=mp_drawing_styles + .get_default_face_mesh_iris_connections_style()) + cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', annotated_image) + +# For webcam input: 
+drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1) +cap = cv2.VideoCapture(0) +with mp_face_mesh.FaceMesh( + max_num_faces=1, + refine_landmarks=True, + min_detection_confidence=0.5, + min_tracking_confidence=0.5) as face_mesh: + while cap.isOpened(): + success, image = cap.read() + if not success: + print("Ignoring empty camera frame.") + # If loading a video, use 'break' instead of 'continue'. + continue + + # To improve performance, optionally mark the image as not writeable to + # pass by reference. + image.flags.writeable = False + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + results = face_mesh.process(image) + + # Draw the face mesh annotations on the image. + image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + if results.multi_face_landmarks: + for face_landmarks in results.multi_face_landmarks: + mp_drawing.draw_landmarks( + image=image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACEMESH_TESSELATION, + landmark_drawing_spec=None, + connection_drawing_spec=mp_drawing_styles + .get_default_face_mesh_tesselation_style()) + mp_drawing.draw_landmarks( + image=image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACEMESH_CONTOURS, + landmark_drawing_spec=None, + connection_drawing_spec=mp_drawing_styles + .get_default_face_mesh_contours_style()) + mp_drawing.draw_landmarks( + image=image, + landmark_list=face_landmarks, + connections=mp_face_mesh.FACEMESH_IRISES, + landmark_drawing_spec=None, + connection_drawing_spec=mp_drawing_styles + .get_default_face_mesh_iris_connections_style()) + # Flip the image horizontally for a selfie-view display. + cv2.imshow('MediaPipe Face Mesh', cv2.flip(image, 1)) + if cv2.waitKey(5) & 0xFF == 27: + break +cap.release() +``` + +### JavaScript Solution API + +Please first see general [introduction](../getting_started/javascript.md) on +MediaPipe in JavaScript, then learn more in the companion [web demo](#resources) +and the following usage example. 
+ +Supported configuration options: + +* [maxNumFaces](#max_num_faces) +* [refineLandmarks](#refine_landmarks) +* [minDetectionConfidence](#min_detection_confidence) +* [minTrackingConfidence](#min_tracking_confidence) + +```html + + + + + + + + + + + +
+ + +
+ + +``` + +```javascript + +``` + +### Android Solution API + +Please first follow general +[instructions](../getting_started/android_solutions.md) to add MediaPipe Gradle +dependencies and try the Android Solution API in the companion +[example Android Studio project](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/solutions/facemesh), +and learn more in the usage example below. + +Supported configuration options: + +* [staticImageMode](#static_image_mode) +* [maxNumFaces](#max_num_faces) +* [refineLandmarks](#refine_landmarks) +* runOnGpu: Run the pipeline and the model inference on GPU or CPU. + +#### Camera Input + +```java +// For camera input and result rendering with OpenGL. +FaceMeshOptions faceMeshOptions = + FaceMeshOptions.builder() + .setStaticImageMode(false) + .setRefineLandmarks(true) + .setMaxNumFaces(1) + .setRunOnGpu(true).build(); +FaceMesh faceMesh = new FaceMesh(this, faceMeshOptions); +faceMesh.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Face Mesh error:" + message)); + +// Initializes a new CameraInput instance and connects it to MediaPipe Face Mesh Solution. +CameraInput cameraInput = new CameraInput(this); +cameraInput.setNewFrameListener( + textureFrame -> faceMesh.send(textureFrame)); + +// Initializes a new GlSurfaceView with a ResultGlRenderer instance +// that provides the interfaces to run user-defined OpenGL rendering code. +// See mediapipe/examples/android/solutions/facemesh/src/main/java/com/google/mediapipe/examples/facemesh/FaceMeshResultGlRenderer.java +// as an example. 
+SolutionGlSurfaceView glSurfaceView = + new SolutionGlSurfaceView<>( + this, faceMesh.getGlContext(), faceMesh.getGlMajorVersion()); +glSurfaceView.setSolutionResultRenderer(new FaceMeshResultGlRenderer()); +glSurfaceView.setRenderInputImage(true); + +faceMesh.setResultListener( + faceMeshResult -> { + NormalizedLandmark noseLandmark = + faceMeshResult.multiFaceLandmarks().get(0).getLandmarkList().get(1); + Log.i( + TAG, + String.format( + "MediaPipe Face Mesh nose normalized coordinates (value range: [0, 1]): x=%f, y=%f", + noseLandmark.getX(), noseLandmark.getY())); + // Request GL rendering. + glSurfaceView.setRenderData(faceMeshResult); + glSurfaceView.requestRender(); + }); + +// The runnable to start camera after the GLSurfaceView is attached. +glSurfaceView.post( + () -> + cameraInput.start( + this, + faceMesh.getGlContext(), + CameraInput.CameraFacing.FRONT, + glSurfaceView.getWidth(), + glSurfaceView.getHeight())); +``` + +#### Image Input + +```java +// For reading images from gallery and drawing the output in an ImageView. +FaceMeshOptions faceMeshOptions = + FaceMeshOptions.builder() + .setStaticImageMode(true) + .setRefineLandmarks(true) + .setMaxNumFaces(1) + .setRunOnGpu(true).build(); +FaceMesh faceMesh = new FaceMesh(this, faceMeshOptions); + +// Connects MediaPipe Face Mesh Solution to the user-defined ImageView instance +// that allows users to have the custom drawing of the output landmarks on it. +// See mediapipe/examples/android/solutions/facemesh/src/main/java/com/google/mediapipe/examples/facemesh/FaceMeshResultImageView.java +// as an example. 
+FaceMeshResultImageView imageView = new FaceMeshResultImageView(this); +faceMesh.setResultListener( + faceMeshResult -> { + int width = faceMeshResult.inputBitmap().getWidth(); + int height = faceMeshResult.inputBitmap().getHeight(); + NormalizedLandmark noseLandmark = + faceMeshResult.multiFaceLandmarks().get(0).getLandmarkList().get(1); + Log.i( + TAG, + String.format( + "MediaPipe Face Mesh nose coordinates (pixel values): x=%f, y=%f", + noseLandmark.getX() * width, noseLandmark.getY() * height)); + // Request canvas drawing. + imageView.setFaceMeshResult(faceMeshResult); + runOnUiThread(() -> imageView.update()); + }); +faceMesh.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Face Mesh error:" + message)); + +// ActivityResultLauncher to get an image from the gallery as Bitmap. +ActivityResultLauncher imageGetter = + registerForActivityResult( + new ActivityResultContracts.StartActivityForResult(), + result -> { + Intent resultIntent = result.getData(); + if (resultIntent != null && result.getResultCode() == RESULT_OK) { + Bitmap bitmap = null; + try { + bitmap = + MediaStore.Images.Media.getBitmap( + this.getContentResolver(), resultIntent.getData()); + // Please also rotate the Bitmap based on its orientation. + } catch (IOException e) { + Log.e(TAG, "Bitmap reading error:" + e); + } + if (bitmap != null) { + faceMesh.send(bitmap); + } + } + }); +Intent pickImageIntent = new Intent(Intent.ACTION_PICK); +pickImageIntent.setDataAndType(MediaStore.Images.Media.INTERNAL_CONTENT_URI, "image/*"); +imageGetter.launch(pickImageIntent); +``` + +#### Video Input + +```java +// For video input and result rendering with OpenGL. 
+FaceMeshOptions faceMeshOptions = + FaceMeshOptions.builder() + .setStaticImageMode(false) + .setRefineLandmarks(true) + .setMaxNumFaces(1) + .setRunOnGpu(true).build(); +FaceMesh faceMesh = new FaceMesh(this, faceMeshOptions); +faceMesh.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Face Mesh error:" + message)); + +// Initializes a new VideoInput instance and connects it to MediaPipe Face Mesh Solution. +VideoInput videoInput = new VideoInput(this); +videoInput.setNewFrameListener( + textureFrame -> faceMesh.send(textureFrame)); + +// Initializes a new GlSurfaceView with a ResultGlRenderer instance +// that provides the interfaces to run user-defined OpenGL rendering code. +// See mediapipe/examples/android/solutions/facemesh/src/main/java/com/google/mediapipe/examples/facemesh/FaceMeshResultGlRenderer.java +// as an example. +SolutionGlSurfaceView glSurfaceView = + new SolutionGlSurfaceView<>( + this, faceMesh.getGlContext(), faceMesh.getGlMajorVersion()); +glSurfaceView.setSolutionResultRenderer(new FaceMeshResultGlRenderer()); +glSurfaceView.setRenderInputImage(true); + +faceMesh.setResultListener( + faceMeshResult -> { + NormalizedLandmark noseLandmark = + faceMeshResult.multiFaceLandmarks().get(0).getLandmarkList().get(1); + Log.i( + TAG, + String.format( + "MediaPipe Face Mesh nose normalized coordinates (value range: [0, 1]): x=%f, y=%f", + noseLandmark.getX(), noseLandmark.getY())); + // Request GL rendering. 
+ glSurfaceView.setRenderData(faceMeshResult); + glSurfaceView.requestRender(); + }); + +ActivityResultLauncher videoGetter = + registerForActivityResult( + new ActivityResultContracts.StartActivityForResult(), + result -> { + Intent resultIntent = result.getData(); + if (resultIntent != null) { + if (result.getResultCode() == RESULT_OK) { + glSurfaceView.post( + () -> + videoInput.start( + this, + resultIntent.getData(), + faceMesh.getGlContext(), + glSurfaceView.getWidth(), + glSurfaceView.getHeight())); + } + } + }); +Intent pickVideoIntent = new Intent(Intent.ACTION_PICK); +pickVideoIntent.setDataAndType(MediaStore.Video.Media.INTERNAL_CONTENT_URI, "video/*"); +videoGetter.launch(pickVideoIntent); +``` + +## Example Apps + +Please first see general instructions for +[Android](../getting_started/android.md), [iOS](../getting_started/ios.md) and +[desktop](../getting_started/cpp.md) on how to build MediaPipe examples. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +### Face Landmark Example + +Face landmark example showcases real-time, cross-platform face landmark +detection. For visual reference, please refer to *Fig. 2*. 
+ +#### Mobile + +* Graph: + [`mediapipe/graphs/face_mesh/face_mesh_mobile.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_mesh/face_mesh_mobile.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1pUmd7CXCL_onYMbsZo5p91cH0oNnR4gi) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/facemeshgpu:facemeshgpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facemeshgpu/BUILD) +* iOS target: + [`mediapipe/examples/ios/facemeshgpu:FaceMeshGpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/facemeshgpu/BUILD) + +Tip: Maximum number of faces to detect/process is set to 1 by default. To change +it, for Android modify `NUM_FACES` in +[MainActivity.java](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/facemeshgpu/MainActivity.java), +and for iOS modify `kNumFaces` in +[FaceMeshGpuViewController.mm](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/facemeshgpu/FaceMeshGpuViewController.mm). + +#### Desktop + +* Running on CPU + * Graph: + [`mediapipe/graphs/face_mesh/face_mesh_desktop_live.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_mesh/face_mesh_desktop_live.pbtxt) + * Target: + [`mediapipe/examples/desktop/face_mesh:face_mesh_cpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/face_mesh/BUILD) +* Running on GPU + * Graph: + [`mediapipe/graphs/face_mesh/face_mesh_desktop_live_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_mesh/face_mesh_desktop_live_gpu.pbtxt) + * Target: + [`mediapipe/examples/desktop/face_mesh:face_mesh_gpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/face_mesh/BUILD) + +Tip: Maximum number of faces to detect/process is set to 1 by default. 
To change +it, in the graph file modify the option of `ConstantSidePacketCalculator`. + +### Face Effect Example + +Face effect example showcases real-time mobile face effect application use case +for the Face Mesh solution. To enable a better user experience, this example +only works for a single face. For visual reference, please refer to *Fig. 4*. + +#### Mobile + +* Graph: + [`mediapipe/graphs/face_effect/face_effect_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/face_effect/face_effect_gpu.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1ccnaDnffEuIXriBZr2SK_Eu4FpO7K44s) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/faceeffect`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/faceeffect/BUILD) +* iOS target: + [`mediapipe/examples/ios/faceeffect`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/faceeffect/BUILD) + +## Resources + +* Google AI Blog: + [Real-Time AR Self-Expression with Machine Learning](https://ai.googleblog.com/2019/03/real-time-ar-self-expression-with.html) +* TensorFlow Blog: + [Face and hand tracking in the browser with MediaPipe and TensorFlow.js](https://blog.tensorflow.org/2020/03/face-and-hand-tracking-in-browser-with-mediapipe-and-tensorflowjs.html) +* Google Developers Blog: + [MediaPipe 3D Face Transform](https://developers.googleblog.com/2020/09/mediapipe-3d-face-transform.html) +* Paper: + [Real-time Facial Surface Geometry from Monocular Video on Mobile GPUs](https://arxiv.org/abs/1907.06724) + ([poster](https://docs.google.com/presentation/d/1-LWwOMO9TzEVdrZ1CS1ndJzciRHfYDJfbSxH_ke_JRg/present?slide=id.g5986dd4b4c_4_212)) +* Canonical face model: + [FBX](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry/data/canonical_face_model.fbx), + [OBJ](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry/data/canonical_face_model.obj), + [UV 
visualization](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualization.png) +* [Models and model cards](./models.md#face_mesh) +* [Web demo](https://code.mediapipe.dev/codepen/face_mesh) +* [Python Colab](https://mediapipe.page.link/face_mesh_py_colab) diff --git a/docs/solutions/hair_segmentation.md b/docs/solutions/hair_segmentation.md new file mode 100644 index 0000000..feb40f9 --- /dev/null +++ b/docs/solutions/hair_segmentation.md @@ -0,0 +1,78 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/vision/image_segmenter/ +title: Hair Segmentation +parent: MediaPipe Legacy Solutions +nav_order: 8 +--- + +# MediaPipe Hair Segmentation +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +As of April 4, 2023, this solution was upgraded to a new MediaPipe +Solution. For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/vision/image_segmenter/) +site.* + +---- + +![hair_segmentation_android_gpu_gif](https://mediapipe.dev/images/mobile/hair_segmentation_android_gpu.gif) + +## Example Apps + +Please first see general instructions for +[Android](../getting_started/android.md), [iOS](../getting_started/ios.md) and +[desktop](../getting_started/cpp.md) on how to build MediaPipe examples. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +### Mobile + +* Graph: + [`mediapipe/graphs/hair_segmentation/hair_segmentation_mobile_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hair_segmentation/hair_segmentation_mobile_gpu.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1mmLtyL8IRfCUbqqu0-E-Hgjr_e6P3XAy) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/hairsegmentationgpu:hairsegmentationgpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/hairsegmentationgpu/BUILD) +* iOS target: Not available + +### Desktop + +* Running on CPU: Not available +* Running on GPU + * Graph: + [`mediapipe/graphs/hair_segmentation/hair_segmentation_mobile_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hair_segmentation/hair_segmentation_mobile_gpu.pbtxt) + * Target: + [`mediapipe/examples/desktop/hair_segmentation:hair_segmentation_gpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/hair_segmentation/BUILD) + +### Web + +Use [this 
link](https://viz.mediapipe.dev/demo/hair_segmentation) to load a demo +in the MediaPipe Visualizer, and over there click the "Runner" icon in the top +bar as shown below. The demos use your webcam video as input, which is +processed entirely locally in real time and never leaves your device. Please see +[MediaPipe on the Web](https://developers.googleblog.com/2020/01/mediapipe-on-web.html) +in Google Developers Blog for details. + +![visualizer_runner](https://mediapipe.dev/images/visualizer_runner.png) + +## Resources + +* Paper: + [Real-time Hair segmentation and recoloring on Mobile GPUs](https://arxiv.org/abs/1907.06740) + ([presentation](https://drive.google.com/file/d/1C8WYlWdDRNtU1_pYBvkkG5Z5wqYqf0yj/view)) + ([supplementary video](https://drive.google.com/file/d/1LPtM99Ch2ogyXYbDNpEqnUfhFq0TfLuf/view)) +* [Models and model cards](./models.md#hair_segmentation) diff --git a/docs/solutions/hands.md b/docs/solutions/hands.md new file mode 100644 index 0000000..6cf2264 --- /dev/null +++ b/docs/solutions/hands.md @@ -0,0 +1,671 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/vision/hand_landmarker +title: Hands +parent: MediaPipe Legacy Solutions +nav_order: 4 +--- + +# MediaPipe Hands +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +As of March 1, 2023, this solution was upgraded to a new MediaPipe +Solution. For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/vision/hand_landmarker) +site.* + +---- + +## Overview + +The ability to perceive the shape and motion of hands can be a vital component +in improving the user experience across a variety of technological domains and +platforms. For example, it can form the basis for sign language understanding +and hand gesture control, and can also enable the overlay of digital content and +information on top of the physical world in augmented reality. While coming +naturally to people, robust real-time hand perception is a decidedly challenging +computer vision task, as hands often occlude themselves or each other (e.g. +finger/palm occlusions and hand shakes) and lack high contrast patterns. + +MediaPipe Hands is a high-fidelity hand and finger tracking solution. It employs +machine learning (ML) to infer 21 3D landmarks of a hand from just a single +frame. Whereas current state-of-the-art approaches rely primarily on powerful +desktop environments for inference, our method achieves real-time performance on +a mobile phone, and even scales to multiple hands. We hope that providing this +hand perception functionality to the wider research and development community +will result in an emergence of creative use cases, stimulating new applications +and new research avenues. + +![hand_tracking_3d_android_gpu.gif](https://mediapipe.dev/images/mobile/hand_tracking_3d_android_gpu.gif) | +:------------------------------------------------------------------------------------: | +*Fig 1. 
Tracked 3D hand landmarks are represented by dots in different shades, with the brighter ones denoting landmarks closer to the camera.* | + +## ML Pipeline + +MediaPipe Hands utilizes an ML pipeline consisting of multiple models working +together: A palm detection model that operates on the full image and returns an +oriented hand bounding box. A hand landmark model that operates on the cropped +image region defined by the palm detector and returns high-fidelity 3D hand +keypoints. This strategy is similar to that employed in our +[MediaPipe Face Mesh](./face_mesh.md) solution, which uses a face detector +together with a face landmark model. + +Providing the accurately cropped hand image to the hand landmark model +drastically reduces the need for data augmentation (e.g. rotations, translation +and scale) and instead allows the network to dedicate most of its capacity +towards coordinate prediction accuracy. In addition, in our pipeline the crops +can also be generated based on the hand landmarks identified in the previous +frame, and only when the landmark model could no longer identify hand presence +is palm detection invoked to relocalize the hand. + +The pipeline is implemented as a MediaPipe +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt) +that uses a +[hand landmark tracking subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt) +from the +[hand landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark), +and renders using a dedicated +[hand renderer subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/subgraphs/hand_renderer_gpu.pbtxt). 
+The +[hand landmark tracking subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_tracking_gpu.pbtxt) +internally uses a +[hand landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark/hand_landmark_gpu.pbtxt) +from the same module and a +[palm detection subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/palm_detection/palm_detection_gpu.pbtxt) +from the +[palm detection module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/palm_detection). + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +## Models + +### Palm Detection Model + +To detect initial hand locations, we designed a +[single-shot detector](https://arxiv.org/abs/1512.02325) model optimized for +mobile real-time uses in a manner similar to the face detection model in +[MediaPipe Face Mesh](./face_mesh.md). Detecting hands is a decidedly complex +task: our +[lite model](https://storage.googleapis.com/mediapipe-assets/palm_detection_lite.tflite) +and +[full model](https://storage.googleapis.com/mediapipe-assets/palm_detection_full.tflite) +have to work across a variety of hand sizes with a large scale span (~20x) +relative to the image frame and be able to detect occluded and self-occluded +hands. Whereas faces have high contrast patterns, e.g., in the eye and mouth +region, the lack of such features in hands makes it comparatively difficult to +detect them reliably from their visual features alone. Instead, providing +additional context, like arm, body, or person features, aids accurate hand +localization. + +Our method addresses the above challenges using different strategies. 
First, we +train a palm detector instead of a hand detector, since estimating bounding +boxes of rigid objects like palms and fists is significantly simpler than +detecting hands with articulated fingers. In addition, as palms are smaller +objects, the non-maximum suppression algorithm works well even for two-hand +self-occlusion cases, like handshakes. Moreover, palms can be modelled using +square bounding boxes (anchors in ML terminology) ignoring other aspect ratios, +and therefore reducing the number of anchors by a factor of 3-5. Second, an +encoder-decoder feature extractor is used for bigger scene context awareness +even for small objects (similar to the RetinaNet approach). Lastly, we minimize +the focal loss during training to support a large number of anchors resulting +from the high scale variance. + +With the above techniques, we achieve an average precision of 95.7% in palm +detection. Using a regular cross entropy loss and no decoder gives a baseline of +just 86.22%. + +### Hand Landmark Model + +After the palm detection over the whole image, our subsequent hand landmark +[model](https://storage.googleapis.com/mediapipe-assets/hand_landmark_full.tflite) +performs precise keypoint localization of 21 3D hand-knuckle coordinates inside +the detected hand regions via regression, that is, direct coordinate prediction. +The model learns a consistent internal hand pose representation and is robust +even to partially visible hands and self-occlusions. + +To obtain ground truth data, we have manually annotated ~30K real-world images +with 21 3D coordinates, as shown below (we take Z-value from image depth map, if +it exists per corresponding coordinate). To better cover the possible hand poses +and provide additional supervision on the nature of hand geometry, we also +render a high-quality synthetic hand model over various backgrounds and map it +to the corresponding 3D coordinates. 
+ +![hand_landmarks.png](https://mediapipe.dev/images/mobile/hand_landmarks.png) | +:--------------------------------------------------------: | +*Fig 2. 21 hand landmarks.* | + +![hand_crops.png](https://mediapipe.dev/images/mobile/hand_crops.png) | +:-------------------------------------------------------------------------: | +*Fig 3. Top: Aligned hand crops passed to the tracking network with ground truth annotation. Bottom: Rendered synthetic hand images with ground truth annotation.* | + +## Solution APIs + +### Configuration Options + +Naming style and availability may differ slightly across platforms/languages. + +#### static_image_mode + +If set to `false`, the solution treats the input images as a video stream. It +will try to detect hands in the first input images, and upon a successful +detection further localizes the hand landmarks. In subsequent images, once all +[max_num_hands](#max_num_hands) hands are detected and the corresponding hand +landmarks are localized, it simply tracks those landmarks without invoking +another detection until it loses track of any of the hands. This reduces latency +and is ideal for processing video frames. If set to `true`, hand detection runs +on every input image, ideal for processing a batch of static, possibly +unrelated, images. Default to `false`. + +#### max_num_hands + +Maximum number of hands to detect. Default to `2`. + +#### model_complexity + +Complexity of the hand landmark model: `0` or `1`. Landmark accuracy as well as +inference latency generally go up with the model complexity. Default to `1`. + +#### min_detection_confidence + +Minimum confidence value (`[0.0, 1.0]`) from the hand detection model for the +detection to be considered successful. Default to `0.5`. 
+ +#### min_tracking_confidence + +Minimum confidence value (`[0.0, 1.0]`) from the landmark-tracking model for the +hand landmarks to be considered tracked successfully, or otherwise hand +detection will be invoked automatically on the next input image. Setting it to a +higher value can increase robustness of the solution, at the expense of a higher +latency. Ignored if [static_image_mode](#static_image_mode) is `true`, where +hand detection simply runs on every image. Default to `0.5`. + +### Output + +Naming style may differ slightly across platforms/languages. + +#### multi_hand_landmarks + +Collection of detected/tracked hands, where each hand is represented as a list +of 21 hand landmarks and each landmark is composed of `x`, `y` and `z`. `x` and +`y` are normalized to `[0.0, 1.0]` by the image width and height respectively. +`z` represents the landmark depth with the depth at the wrist being the origin, +and the smaller the value the closer the landmark is to the camera. The +magnitude of `z` uses roughly the same scale as `x`. + +#### multi_hand_world_landmarks + +Collection of detected/tracked hands, where each hand is represented as a list +of 21 hand landmarks in world coordinates. Each landmark is composed of `x`, `y` +and `z`: real-world 3D coordinates in meters with the origin at the hand's +approximate geometric center. + +#### multi_handedness + +Collection of handedness of the detected/tracked hands (i.e. is it a left or +right hand). Each hand is composed of `label` and `score`. `label` is a string +of value either `"Left"` or `"Right"`. `score` is the estimated probability of +the predicted handedness and is always greater than or equal to `0.5` (and the +opposite handedness has an estimated probability of `1 - score`). + +Note that handedness is determined assuming the input image is mirrored, i.e., +taken with a front-facing/selfie camera with images flipped horizontally. 
If it +is not the case, please swap the handedness output in the application. + +### Python Solution API + +Please first follow general [instructions](../getting_started/python.md) to +install MediaPipe Python package, then learn more in the companion +[Python Colab](#resources) and the usage example below. + +Supported configuration options: + +* [static_image_mode](#static_image_mode) +* [max_num_hands](#max_num_hands) +* [model_complexity](#model_complexity) +* [min_detection_confidence](#min_detection_confidence) +* [min_tracking_confidence](#min_tracking_confidence) + +```python +import cv2 +import mediapipe as mp +mp_drawing = mp.solutions.drawing_utils +mp_drawing_styles = mp.solutions.drawing_styles +mp_hands = mp.solutions.hands + +# For static images: +IMAGE_FILES = [] +with mp_hands.Hands( + static_image_mode=True, + max_num_hands=2, + min_detection_confidence=0.5) as hands: + for idx, file in enumerate(IMAGE_FILES): + # Read an image, flip it around y-axis for correct handedness output (see + # above). + image = cv2.flip(cv2.imread(file), 1) + # Convert the BGR image to RGB before processing. + results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + # Print handedness and draw hand landmarks on the image. 
+ print('Handedness:', results.multi_handedness) + if not results.multi_hand_landmarks: + continue + image_height, image_width, _ = image.shape + annotated_image = image.copy() + for hand_landmarks in results.multi_hand_landmarks: + print('hand_landmarks:', hand_landmarks) + print( + f'Index finger tip coordinates: (', + f'{hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x * image_width}, ' + f'{hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y * image_height})' + ) + mp_drawing.draw_landmarks( + annotated_image, + hand_landmarks, + mp_hands.HAND_CONNECTIONS, + mp_drawing_styles.get_default_hand_landmarks_style(), + mp_drawing_styles.get_default_hand_connections_style()) + cv2.imwrite( + '/tmp/annotated_image' + str(idx) + '.png', cv2.flip(annotated_image, 1)) + # Draw hand world landmarks. + if not results.multi_hand_world_landmarks: + continue + for hand_world_landmarks in results.multi_hand_world_landmarks: + mp_drawing.plot_landmarks( + hand_world_landmarks, mp_hands.HAND_CONNECTIONS, azimuth=5) + +# For webcam input: +cap = cv2.VideoCapture(0) +with mp_hands.Hands( + model_complexity=0, + min_detection_confidence=0.5, + min_tracking_confidence=0.5) as hands: + while cap.isOpened(): + success, image = cap.read() + if not success: + print("Ignoring empty camera frame.") + # If loading a video, use 'break' instead of 'continue'. + continue + + # To improve performance, optionally mark the image as not writeable to + # pass by reference. + image.flags.writeable = False + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + results = hands.process(image) + + # Draw the hand annotations on the image. 
+ image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + if results.multi_hand_landmarks: + for hand_landmarks in results.multi_hand_landmarks: + mp_drawing.draw_landmarks( + image, + hand_landmarks, + mp_hands.HAND_CONNECTIONS, + mp_drawing_styles.get_default_hand_landmarks_style(), + mp_drawing_styles.get_default_hand_connections_style()) + # Flip the image horizontally for a selfie-view display. + cv2.imshow('MediaPipe Hands', cv2.flip(image, 1)) + if cv2.waitKey(5) & 0xFF == 27: + break +cap.release() +``` + +### JavaScript Solution API + +Please first see general [introduction](../getting_started/javascript.md) on +MediaPipe in JavaScript, then learn more in the companion [web demo](#resources) +and a [fun application], and the following usage example. + +Supported configuration options: + +* [maxNumHands](#max_num_hands) +* [modelComplexity](#model_complexity) +* [minDetectionConfidence](#min_detection_confidence) +* [minTrackingConfidence](#min_tracking_confidence) + +```html + + + + + + + + + + + +
+ + +
+ + +``` + +```javascript + +``` + +### Android Solution API + +Please first follow general +[instructions](../getting_started/android_solutions.md) to add MediaPipe Gradle +dependencies and try the Android Solution API in the companion +[example Android Studio project](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/solutions/hands), +and learn more in the usage example below. + +Supported configuration options: + +* [staticImageMode](#static_image_mode) +* [maxNumHands](#max_num_hands) +* runOnGpu: Run the pipeline and the model inference on GPU or CPU. + +#### Camera Input + +```java +// For camera input and result rendering with OpenGL. +HandsOptions handsOptions = + HandsOptions.builder() + .setStaticImageMode(false) + .setMaxNumHands(2) + .setRunOnGpu(true).build(); +Hands hands = new Hands(this, handsOptions); +hands.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Hands error:" + message)); + +// Initializes a new CameraInput instance and connects it to MediaPipe Hands Solution. +CameraInput cameraInput = new CameraInput(this); +cameraInput.setNewFrameListener( + textureFrame -> hands.send(textureFrame)); + +// Initializes a new GlSurfaceView with a ResultGlRenderer instance +// that provides the interfaces to run user-defined OpenGL rendering code. +// See mediapipe/examples/android/solutions/hands/src/main/java/com/google/mediapipe/examples/hands/HandsResultGlRenderer.java +// as an example. 
+SolutionGlSurfaceView glSurfaceView = + new SolutionGlSurfaceView<>( + this, hands.getGlContext(), hands.getGlMajorVersion()); +glSurfaceView.setSolutionResultRenderer(new HandsResultGlRenderer()); +glSurfaceView.setRenderInputImage(true); + +hands.setResultListener( + handsResult -> { + if (handsResult.multiHandLandmarks().isEmpty()) { + return; + } + NormalizedLandmark wristLandmark = + handsResult.multiHandLandmarks().get(0).getLandmarkList().get(HandLandmark.WRIST); + Log.i( + TAG, + String.format( + "MediaPipe Hand wrist normalized coordinates (value range: [0, 1]): x=%f, y=%f", + wristLandmark.getX(), wristLandmark.getY())); + // Request GL rendering. + glSurfaceView.setRenderData(handsResult); + glSurfaceView.requestRender(); + }); + +// The runnable to start camera after the GLSurfaceView is attached. +glSurfaceView.post( + () -> + cameraInput.start( + this, + hands.getGlContext(), + CameraInput.CameraFacing.FRONT, + glSurfaceView.getWidth(), + glSurfaceView.getHeight())); +``` + +#### Image Input + +```java +// For reading images from gallery and drawing the output in an ImageView. +HandsOptions handsOptions = + HandsOptions.builder() + .setStaticImageMode(true) + .setMaxNumHands(2) + .setRunOnGpu(true).build(); +Hands hands = new Hands(this, handsOptions); + +// Connects MediaPipe Hands Solution to the user-defined ImageView instance that +// allows users to have the custom drawing of the output landmarks on it. +// See mediapipe/examples/android/solutions/hands/src/main/java/com/google/mediapipe/examples/hands/HandsResultImageView.java +// as an example. 
+HandsResultImageView imageView = new HandsResultImageView(this); +hands.setResultListener( + handsResult -> { + if (handsResult.multiHandLandmarks().isEmpty()) { + return; + } + int width = handsResult.inputBitmap().getWidth(); + int height = handsResult.inputBitmap().getHeight(); + NormalizedLandmark wristLandmark = + handsResult.multiHandLandmarks().get(0).getLandmarkList().get(HandLandmark.WRIST); + Log.i( + TAG, + String.format( + "MediaPipe Hand wrist coordinates (pixel values): x=%f, y=%f", + wristLandmark.getX() * width, wristLandmark.getY() * height)); + // Request canvas drawing. + imageView.setHandsResult(handsResult); + runOnUiThread(() -> imageView.update()); + }); +hands.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Hands error:" + message)); + +// ActivityResultLauncher to get an image from the gallery as Bitmap. +ActivityResultLauncher imageGetter = + registerForActivityResult( + new ActivityResultContracts.StartActivityForResult(), + result -> { + Intent resultIntent = result.getData(); + if (resultIntent != null && result.getResultCode() == RESULT_OK) { + Bitmap bitmap = null; + try { + bitmap = + MediaStore.Images.Media.getBitmap( + this.getContentResolver(), resultIntent.getData()); + // Please also rotate the Bitmap based on its orientation. + } catch (IOException e) { + Log.e(TAG, "Bitmap reading error:" + e); + } + if (bitmap != null) { + hands.send(bitmap); + } + } + }); +Intent pickImageIntent = new Intent(Intent.ACTION_PICK); +pickImageIntent.setDataAndType(MediaStore.Images.Media.INTERNAL_CONTENT_URI, "image/*"); +imageGetter.launch(pickImageIntent); +``` + +#### Video Input + +```java +// For video input and result rendering with OpenGL. 
+HandsOptions handsOptions = + HandsOptions.builder() + .setStaticImageMode(false) + .setMaxNumHands(2) + .setRunOnGpu(true).build(); +Hands hands = new Hands(this, handsOptions); +hands.setErrorListener( + (message, e) -> Log.e(TAG, "MediaPipe Hands error:" + message)); + +// Initializes a new VideoInput instance and connects it to MediaPipe Hands Solution. +VideoInput videoInput = new VideoInput(this); +videoInput.setNewFrameListener( + textureFrame -> hands.send(textureFrame)); + +// Initializes a new GlSurfaceView with a ResultGlRenderer instance +// that provides the interfaces to run user-defined OpenGL rendering code. +// See mediapipe/examples/android/solutions/hands/src/main/java/com/google/mediapipe/examples/hands/HandsResultGlRenderer.java +// as an example. +SolutionGlSurfaceView glSurfaceView = + new SolutionGlSurfaceView<>( + this, hands.getGlContext(), hands.getGlMajorVersion()); +glSurfaceView.setSolutionResultRenderer(new HandsResultGlRenderer()); +glSurfaceView.setRenderInputImage(true); + +hands.setResultListener( + handsResult -> { + if (handsResult.multiHandLandmarks().isEmpty()) { + return; + } + NormalizedLandmark wristLandmark = + handsResult.multiHandLandmarks().get(0).getLandmarkList().get(HandLandmark.WRIST); + Log.i( + TAG, + String.format( + "MediaPipe Hand wrist normalized coordinates (value range: [0, 1]): x=%f, y=%f", + wristLandmark.getX(), wristLandmark.getY())); + // Request GL rendering. 
+ glSurfaceView.setRenderData(handsResult); + glSurfaceView.requestRender(); + }); + +ActivityResultLauncher videoGetter = + registerForActivityResult( + new ActivityResultContracts.StartActivityForResult(), + result -> { + Intent resultIntent = result.getData(); + if (resultIntent != null) { + if (result.getResultCode() == RESULT_OK) { + glSurfaceView.post( + () -> + videoInput.start( + this, + resultIntent.getData(), + hands.getGlContext(), + glSurfaceView.getWidth(), + glSurfaceView.getHeight())); + } + } + }); +Intent pickVideoIntent = new Intent(Intent.ACTION_PICK); +pickVideoIntent.setDataAndType(MediaStore.Video.Media.INTERNAL_CONTENT_URI, "video/*"); +videoGetter.launch(pickVideoIntent); +``` + +## Example Apps + +Please first see general instructions for +[Android](../getting_started/android.md), [iOS](../getting_started/ios.md) and +[desktop](../getting_started/cpp.md) on how to build MediaPipe examples. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +### Mobile + +#### Main Example + +* Graph: + [`mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_mobile.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1uCjS0y0O0dTDItsMh8x2cf4-l3uHW1vE) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu:handtrackinggpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/BUILD) +* iOS target: + [`mediapipe/examples/ios/handtrackinggpu:HandTrackingGpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/handtrackinggpu/BUILD) + +Tip: Maximum number of hands to detect/process is set to 2 by default. 
To change +it, for Android modify `NUM_HANDS` in +[MainActivity.java](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handtrackinggpu/MainActivity.java), +and for iOS modify `kNumHands` in +[HandTrackingViewController.mm](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/handtrackinggpu/HandTrackingViewController.mm). + +#### Palm/Hand Detection Only (no landmarks) + +* Graph: + [`mediapipe/graphs/hand_tracking/hand_detection_mobile.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_detection_mobile.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1qUlTtH7Ydg-wl_H6VVL8vueu2UCTu37E) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/handdetectiongpu:handdetectiongpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/handdetectiongpu/BUILD) +* iOS target: + [`mediapipe/examples/ios/handdetectiongpu:HandDetectionGpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/handdetectiongpu/BUILD) + +### Desktop + +* Running on CPU + * Graph: + [`mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live.pbtxt) + * Target: + [`mediapipe/examples/desktop/hand_tracking:hand_tracking_cpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/hand_tracking/BUILD) +* Running on GPU + * Graph: + [`mediapipe/graphs/hand_tracking/hand_tracking_desktop_live_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/hand_tracking/hand_tracking_desktop_live_gpu.pbtxt) + * Target: + [`mediapipe/examples/desktop/hand_tracking:hand_tracking_gpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/hand_tracking/BUILD) + +Tip: Maximum number of hands to 
detect/process is set to 2 by default. To change +it, in the graph file modify the option of `ConstantSidePacketCalculator`. + +## Resources + +* Google AI Blog: + [On-Device, Real-Time Hand Tracking with MediaPipe](https://ai.googleblog.com/2019/08/on-device-real-time-hand-tracking-with.html) +* TensorFlow Blog: + [Face and hand tracking in the browser with MediaPipe and TensorFlow.js](https://blog.tensorflow.org/2020/03/face-and-hand-tracking-in-browser-with-mediapipe-and-tensorflowjs.html) +* Paper: + [MediaPipe Hands: On-device Real-time Hand Tracking](https://arxiv.org/abs/2006.10214) + ([presentation](https://www.youtube.com/watch?v=I-UOrvxxXEk)) +* [Models and model cards](./models.md#hands) +* [Web demo](https://code.mediapipe.dev/codepen/hands) +* [Fun application](https://code.mediapipe.dev/codepen/defrost) +* [Python Colab](https://mediapipe.page.link/hands_py_colab) diff --git a/docs/solutions/holistic.md b/docs/solutions/holistic.md new file mode 100644 index 0000000..6909096 --- /dev/null +++ b/docs/solutions/holistic.md @@ -0,0 +1,502 @@ +--- +layout: forward +target: https://github.com/google/mediapipe/blob/master/docs/solutions/holistic.md +title: Holistic +parent: MediaPipe Legacy Solutions +nav_order: 6 +--- + +# MediaPipe Holistic +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +As of March 1, 2023, this solution is planned to be upgraded to a new MediaPipe +Solution. For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/guide#legacy) +site.* + +---- + +## Overview + +Live perception of simultaneous [human pose](./pose.md), +[face landmarks](./face_mesh.md), and [hand tracking](./hands.md) in real-time +on mobile devices can enable various modern life applications: fitness and sport +analysis, gesture control and sign language recognition, augmented reality +try-on and effects. MediaPipe already offers fast and accurate, yet separate, +solutions for these tasks. Combining them all in real-time into a semantically +consistent end-to-end solution is a uniquely difficult problem requiring +simultaneous inference of multiple, dependent neural networks. + +![holistic_sports_and_gestures_example.gif](https://mediapipe.dev/images/mobile/holistic_sports_and_gestures_example.gif) | +:----------------------------------------------------------------------------------------------------: | +*Fig 1. Example of MediaPipe Holistic.* | + +## ML Pipeline + +The MediaPipe Holistic pipeline integrates separate models for +[pose](./pose.md), [face](./face_mesh.md) and [hand](./hands.md) components, +each of which are optimized for their particular domain. However, because of +their different specializations, the input to one component is not well-suited +for the others. The pose estimation model, for example, takes a lower, fixed +resolution video frame (256x256) as input. But if one were to crop the hand and +face regions from that image to pass to their respective models, the image +resolution would be too low for accurate articulation. Therefore, we designed +MediaPipe Holistic as a multi-stage pipeline, which treats the different regions +using a region appropriate image resolution. 
+ +First, we estimate the human pose (top of Fig 2) with [BlazePose](./pose.md)’s +pose detector and subsequent landmark model. Then, using the inferred pose +landmarks we derive three regions of interest (ROI) crops for each hand (2x) and +the face, and employ a re-crop model to improve the ROI. We then crop the +full-resolution input frame to these ROIs and apply task-specific face and hand +models to estimate their corresponding landmarks. Finally, we merge all +landmarks with those of the pose model to yield the full 540+ landmarks. + +![holistic_pipeline_example.jpg](https://mediapipe.dev/images/mobile/holistic_pipeline_example.jpg) | +:------------------------------------------------------------------------------: | +*Fig 2. MediaPipe Holistic Pipeline Overview.* | + +To streamline the identification of ROIs for face and hands, we utilize a +tracking approach similar to the one we use for standalone +[face](./face_mesh.md) and [hand](./hands.md) pipelines. It assumes that the +object doesn't move significantly between frames and uses estimation from the +previous frame as a guide to the object region on the current one. However, +during fast movements, the tracker can lose the target, which requires the +detector to re-localize it in the image. MediaPipe Holistic uses +[pose](./pose.md) prediction (on every frame) as an additional ROI prior to +reduce the response time of the pipeline when reacting to fast movements. This +also enables the model to retain semantic consistency across the body and its +parts by preventing a mixup between left and right hands or body parts of one +person in the frame with another. + +In addition, the resolution of the input frame to the pose model is low enough +that the resulting ROIs for face and hands are still too inaccurate to guide the +re-cropping of those regions, which require a precise input crop to remain +lightweight. 
To close this accuracy gap we use lightweight face and hand re-crop +models that play the role of +[spatial transformers](https://arxiv.org/abs/1506.02025) and cost only ~10% of +corresponding model's inference time. + +The pipeline is implemented as a MediaPipe +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/holistic_tracking/holistic_tracking_gpu.pbtxt) +that uses a +[holistic landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/holistic_landmark/holistic_landmark_gpu.pbtxt) +from the +[holistic landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/holistic_landmark) +and renders using a dedicated +[holistic renderer subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/holistic_tracking/holistic_tracking_to_render_data.pbtxt). +The +[holistic landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/holistic_landmark/holistic_landmark_gpu.pbtxt) +internally uses a +[pose landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/pose_landmark) +, +[hand landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/hand_landmark) +and +[face landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_landmark/). +Please check them for implementation details. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +## Models + +### Landmark Models + +MediaPipe Holistic utilizes the pose, face and hand landmark models in +[MediaPipe Pose](./pose.md), [MediaPipe Face Mesh](./face_mesh.md) and +[MediaPipe Hands](./hands.md) respectively to generate a total of 543 landmarks +(33 pose landmarks, 468 face landmarks, and 21 hand landmarks per hand). 
+ +### Hand Recrop Model + +For cases when the accuracy of the pose model is low enough that the resulting +ROIs for hands are still too inaccurate we run the additional lightweight hand +re-crop model that plays the role of +[spatial transformer](https://arxiv.org/abs/1506.02025) and costs only ~10% of +hand model inference time. + +## Solution APIs + +### Cross-platform Configuration Options + +Naming style and availability may differ slightly across platforms/languages. + +#### static_image_mode + +If set to `false`, the solution treats the input images as a video stream. It +will try to detect the most prominent person in the very first images, and upon +a successful detection further localizes the pose and other landmarks. In +subsequent images, it then simply tracks those landmarks without invoking +another detection until it loses track, thus reducing computation and latency. If +set to `true`, person detection runs on every input image, ideal for processing a +batch of static, possibly unrelated, images. Default to `false`. + +#### model_complexity + +Complexity of the pose landmark model: `0`, `1` or `2`. Landmark accuracy as +well as inference latency generally go up with the model complexity. Default to +`1`. + +#### smooth_landmarks + +If set to `true`, the solution filters pose landmarks across different input +images to reduce jitter, but ignored if [static_image_mode](#static_image_mode) +is also set to `true`. Default to `true`. + +#### enable_segmentation + +If set to `true`, in addition to the pose, face and hand landmarks the solution +also generates the segmentation mask. Default to `false`. + +#### smooth_segmentation + +If set to `true`, the solution filters segmentation masks across different input +images to reduce jitter. Ignored if [enable_segmentation](#enable_segmentation) +is `false` or [static_image_mode](#static_image_mode) is `true`. Default to +`true`. 
+ +#### refine_face_landmarks + +Whether to further refine the landmark coordinates around the eyes and lips, and +output additional landmarks around the irises. Default to `false`. + +#### min_detection_confidence + +Minimum confidence value (`[0.0, 1.0]`) from the person-detection model for the +detection to be considered successful. Default to `0.5`. + +#### min_tracking_confidence + +Minimum confidence value (`[0.0, 1.0]`) from the landmark-tracking model for the +pose landmarks to be considered tracked successfully, or otherwise person +detection will be invoked automatically on the next input image. Setting it to a +higher value can increase robustness of the solution, at the expense of a higher +latency. Ignored if [static_image_mode](#static_image_mode) is `true`, where +person detection simply runs on every image. Default to `0.5`. + +### Output + +Naming style may differ slightly across platforms/languages. + +#### pose_landmarks + +A list of pose landmarks. Each landmark consists of the following: + +* `x` and `y`: Landmark coordinates normalized to `[0.0, 1.0]` by the image + width and height respectively. +* `z`: Should be discarded as currently the model is not fully trained to + predict depth, but this is something on the roadmap. +* `visibility`: A value in `[0.0, 1.0]` indicating the likelihood of the + landmark being visible (present and not occluded) in the image. + +#### pose_world_landmarks + +Another list of pose landmarks in world coordinates. Each landmark consists of +the following: + +* `x`, `y` and `z`: Real-world 3D coordinates in meters with the origin at the + center between hips. +* `visibility`: Identical to that defined in the corresponding + [pose_landmarks](#pose_landmarks). + +#### face_landmarks + +A list of 468 face landmarks. Each landmark consists of `x`, `y` and `z`. `x` +and `y` are normalized to `[0.0, 1.0]` by the image width and height +respectively. 
`z` represents the landmark depth with the depth at center of the +head being the origin, and the smaller the value the closer the landmark is to +the camera. The magnitude of `z` uses roughly the same scale as `x`. + +#### left_hand_landmarks + +A list of 21 hand landmarks on the left hand. Each landmark consists of `x`, `y` +and `z`. `x` and `y` are normalized to `[0.0, 1.0]` by the image width and +height respectively. `z` represents the landmark depth with the depth at the +wrist being the origin, and the smaller the value the closer the landmark is to +the camera. The magnitude of `z` uses roughly the same scale as `x`. + +#### right_hand_landmarks + +A list of 21 hand landmarks on the right hand, in the same representation as +[left_hand_landmarks](#left_hand_landmarks). + +#### segmentation_mask + +The output segmentation mask, predicted only when +[enable_segmentation](#enable_segmentation) is set to `true`. The mask has the +same width and height as the input image, and contains values in `[0.0, 1.0]` +where `1.0` and `0.0` indicate high certainty of a "human" and "background" +pixel respectively. Please refer to the platform-specific usage examples below +for usage details. + +### Python Solution API + +Please first follow general [instructions](../getting_started/python.md) to +install MediaPipe Python package, then learn more in the companion +[Python Colab](#resources) and the usage example below. 
+ +Supported configuration options: + +* [static_image_mode](#static_image_mode) +* [model_complexity](#model_complexity) +* [smooth_landmarks](#smooth_landmarks) +* [enable_segmentation](#enable_segmentation) +* [smooth_segmentation](#smooth_segmentation) +* [refine_face_landmarks](#refine_face_landmarks) +* [min_detection_confidence](#min_detection_confidence) +* [min_tracking_confidence](#min_tracking_confidence) + +```python +import cv2 +import mediapipe as mp +import numpy as np +mp_drawing = mp.solutions.drawing_utils +mp_drawing_styles = mp.solutions.drawing_styles +mp_holistic = mp.solutions.holistic + +# For static images: +IMAGE_FILES = [] +BG_COLOR = (192, 192, 192) # gray +with mp_holistic.Holistic( + static_image_mode=True, + model_complexity=2, + enable_segmentation=True, + refine_face_landmarks=True) as holistic: + for idx, file in enumerate(IMAGE_FILES): + image = cv2.imread(file) + image_height, image_width, _ = image.shape + # Convert the BGR image to RGB before processing. + results = holistic.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) + + if results.pose_landmarks: + print( + f'Nose coordinates: (' + f'{results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].x * image_width}, ' + f'{results.pose_landmarks.landmark[mp_holistic.PoseLandmark.NOSE].y * image_height})' + ) + + annotated_image = image.copy() + # Draw segmentation on the image. + # To improve segmentation around boundaries, consider applying a joint + # bilateral filter to "results.segmentation_mask" with "image". + condition = np.stack((results.segmentation_mask,) * 3, axis=-1) > 0.1 + bg_image = np.zeros(image.shape, dtype=np.uint8) + bg_image[:] = BG_COLOR + annotated_image = np.where(condition, annotated_image, bg_image) + # Draw pose, left and right hands, and face landmarks on the image. 
+ mp_drawing.draw_landmarks( + annotated_image, + results.face_landmarks, + mp_holistic.FACEMESH_TESSELATION, + landmark_drawing_spec=None, + connection_drawing_spec=mp_drawing_styles + .get_default_face_mesh_tesselation_style()) + mp_drawing.draw_landmarks( + annotated_image, + results.pose_landmarks, + mp_holistic.POSE_CONNECTIONS, + landmark_drawing_spec=mp_drawing_styles. + get_default_pose_landmarks_style()) + cv2.imwrite('/tmp/annotated_image' + str(idx) + '.png', annotated_image) + # Plot pose world landmarks. + mp_drawing.plot_landmarks( + results.pose_world_landmarks, mp_holistic.POSE_CONNECTIONS) + +# For webcam input: +cap = cv2.VideoCapture(0) +with mp_holistic.Holistic( + min_detection_confidence=0.5, + min_tracking_confidence=0.5) as holistic: + while cap.isOpened(): + success, image = cap.read() + if not success: + print("Ignoring empty camera frame.") + # If loading a video, use 'break' instead of 'continue'. + continue + + # To improve performance, optionally mark the image as not writeable to + # pass by reference. + image.flags.writeable = False + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + results = holistic.process(image) + + # Draw landmark annotation on the image. + image.flags.writeable = True + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + mp_drawing.draw_landmarks( + image, + results.face_landmarks, + mp_holistic.FACEMESH_CONTOURS, + landmark_drawing_spec=None, + connection_drawing_spec=mp_drawing_styles + .get_default_face_mesh_contours_style()) + mp_drawing.draw_landmarks( + image, + results.pose_landmarks, + mp_holistic.POSE_CONNECTIONS, + landmark_drawing_spec=mp_drawing_styles + .get_default_pose_landmarks_style()) + # Flip the image horizontally for a selfie-view display. 
+ cv2.imshow('MediaPipe Holistic', cv2.flip(image, 1)) + if cv2.waitKey(5) & 0xFF == 27: + break +cap.release() +``` + +### JavaScript Solution API + +Please first see general [introduction](../getting_started/javascript.md) on +MediaPipe in JavaScript, then learn more in the companion [web demo](#resources) +and the following usage example. + +Supported configuration options: + +* [modelComplexity](#model_complexity) +* [smoothLandmarks](#smooth_landmarks) +* [enableSegmentation](#enable_segmentation) +* [smoothSegmentation](#smooth_segmentation) +* [refineFaceLandmarks](#refine_face_landmarks) +* [minDetectionConfidence](#min_detection_confidence) +* [minTrackingConfidence](#min_tracking_confidence) + +```html + + + + + + + + + + + +
+ + +
+ + +``` + +```javascript + +``` + +## Example Apps + +Please first see general instructions for +[Android](../getting_started/android.md), [iOS](../getting_started/ios.md), and +[desktop](../getting_started/cpp.md) on how to build MediaPipe examples. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +### Mobile + +* Graph: + [`mediapipe/graphs/holistic_tracking/holistic_tracking_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/holistic_tracking/holistic_tracking_gpu.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1o-Trp2GIRitA0OvmZWUQjVMa476xpfgK/view?usp=sharing) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/holistictrackinggpu:holistictrackinggpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/holistictrackinggpu/BUILD) +* iOS target: + [`mediapipe/examples/ios/holistictrackinggpu:HolisticTrackingGpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/holistictrackinggpu/BUILD) + +### Desktop + +Please first see general instructions for [desktop](../getting_started/cpp.md) +on how to build MediaPipe examples. 
+ +* Running on CPU + * Graph: + [`mediapipe/graphs/holistic_tracking/holistic_tracking_cpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/holistic_tracking/holistic_tracking_cpu.pbtxt) + * Target: + [`mediapipe/examples/desktop/holistic_tracking:holistic_tracking_cpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/holistic_tracking/BUILD) +* Running on GPU + * Graph: + [`mediapipe/graphs/holistic_tracking/holistic_tracking_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/holistic_tracking/holistic_tracking_gpu.pbtxt) + * Target: + [`mediapipe/examples/desktop/holistic_tracking:holistic_tracking_gpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/holistic_tracking/BUILD) + +## Resources + +* Google AI Blog: + [MediaPipe Holistic - Simultaneous Face, Hand and Pose Prediction, on Device](https://ai.googleblog.com/2020/12/mediapipe-holistic-simultaneous-face.html) +* [Models and model cards](./models.md#holistic) +* [Web demo](https://code.mediapipe.dev/codepen/holistic) +* [Python Colab](https://mediapipe.page.link/holistic_py_colab) diff --git a/docs/solutions/instant_motion_tracking.md b/docs/solutions/instant_motion_tracking.md new file mode 100644 index 0000000..361bc91 --- /dev/null +++ b/docs/solutions/instant_motion_tracking.md @@ -0,0 +1,155 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/guide#legacy +title: Instant Motion Tracking +parent: MediaPipe Legacy Solutions +nav_order: 11 +--- + +# MediaPipe Instant Motion Tracking +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +We have ended support for this MediaPipe Legacy Solution as of March 1, 2023. +For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/guide#legacy) +site.* + +---- + +## Overview + +Augmented Reality (AR) technology creates fun, engaging, and immersive user +experiences. The ability to perform AR tracking across devices and platforms, +without initialization, remains important to power AR applications at scale. + +MediaPipe Instant Motion Tracking provides AR tracking across devices and +platforms without initialization or calibration. It is built upon the +[MediaPipe Box Tracking](./box_tracking.md) solution. With Instant Motion +Tracking, you can easily place virtual 2D and 3D content on static or moving +surfaces, allowing them to seamlessly interact with the real-world environment. + +![instant_motion_tracking_android_small](https://mediapipe.dev/images/mobile/instant_motion_tracking_android_small.gif) | +:-----------------------------------------------------------------------: | +*Fig 1. Instant Motion Tracking is used to augment the world with a 3D sticker.* | + +## Pipeline + +The Instant Motion Tracking pipeline is implemented as a MediaPipe +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/instant_motion_tracking/instant_motion_tracking.pbtxt), +which internally utilizes a +[RegionTrackingSubgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/instant_motion_tracking/subgraphs/region_tracking.pbtxt) +in order to perform anchor tracking for each individual 3D sticker. + +We first use a +[StickerManagerCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/instant_motion_tracking/calculators/sticker_manager_calculator.cc) +to prepare the individual sticker data for the rest of the application. 
This +information is then sent to the +[RegionTrackingSubgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/instant_motion_tracking/subgraphs/region_tracking.pbtxt) +that performs 3D region tracking for sticker placement and rendering. Once +acquired, our tracked sticker regions are sent with user transformations (i.e. +gestures from the user to rotate and zoom the sticker) and IMU data to the +[MatricesManagerCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/instant_motion_tracking/calculators/matrices_manager_calculator.cc), +which turns all our sticker transformation data into a set of model matrices. +This data is handled directly by our +[GlAnimationOverlayCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.cc) +as an input stream, which will render the provided texture and object file using +our matrix specifications. The output of +[GlAnimationOverlayCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.cc) +is a video stream depicting the virtual 3D content rendered on top of the real +world, creating immersive AR experiences for users. + +## Using Instant Motion Tracking + +With the Instant Motion Tracking MediaPipe [graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/instant_motion_tracking/instant_motion_tracking.pbtxt), +an application can create an interactive and realistic AR experience by +specifying the required input streams, side packets, and output streams. +The input streams are the following: + +* Input Video (GpuBuffer): Video frames to render augmented stickers onto. +* Rotation Matrix (9-element Float Array): The 3x3 row-major rotation +matrix from the device IMU to determine proper orientation of the device. 
+* Sticker Proto String (String): A string representing the +serialized [sticker buffer protobuf message](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/instant_motion_tracking/calculators/sticker_buffer.proto), +containing a list of all stickers and their attributes. + * Each sticker in the protobuf message has a unique ID to find associated + anchors and transforms, an initial anchor placement in a normalized [0.0, 1.0] + 3D space, a user rotation and user scaling transform on the sticker, + and an integer indicating which type of object to render for the + sticker (e.g. 3D asset or GIF). +* Sticker Sentinel (Integer): When an anchor must be initially placed or +repositioned, this value must be changed to the ID of the anchor to reset from +the sticker buffer protobuf message. If no valid ID is provided, the system +will simply maintain tracking. + +Side packets are also an integral part of the Instant Motion Tracking solution +to provide device-specific information for the rendering system: + +* Field of View (Float): The field of view of the camera in radians. +* Aspect Ratio (Float): The aspect ratio (width / height) of the camera frames + (this ratio corresponds to the image frames themselves, not necessarily the + screen bounds). +* Object Asset (String): The + [GlAnimationOverlayCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.cc) + must be provided with an associated asset file name pointing to the 3D model + to render in the viewfinder. +* (Optional) Texture (ImageFrame on Android, GpuBuffer on iOS): Textures for + the + [GlAnimationOverlayCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.cc) + can be provided either via an input stream (dynamic texturing) or as a side + packet (unchanging texture).
+ +The rendering system for Instant Motion Tracking is powered by OpenGL. For +more information regarding the structure of model matrices and OpenGL rendering, +please visit [OpenGL Wiki](https://www.khronos.org/opengl/wiki/). With the +specifications above, the Instant Motion Tracking capabilities can be adapted to +any device that is able to run the MediaPipe framework with a working IMU system +and connected camera. + +## Example Apps + +Please first see general instructions for +[Android](../getting_started/android.md) on how to build MediaPipe examples. + +* Graph: [mediapipe/graphs/instant_motion_tracking/instant_motion_tracking.pbtxt](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/instant_motion_tracking/instant_motion_tracking.pbtxt) + +* Android target (or download prebuilt [ARM64 APK](https://drive.google.com/file/d/1KnaBBoKpCHR73nOBJ4fL_YdWVTAcwe6L/view?usp=sharing)): +[`mediapipe/examples/android/src/java/com/google/mediapipe/apps/instantmotiontracking:instantmotiontracking`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/instantmotiontracking/BUILD) + +* Assets rendered by the [GlAnimationOverlayCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/calculators/gl_animation_overlay_calculator.cc) must be preprocessed into an OpenGL-ready custom .uuu format. This can be done +for user assets as follows: +> First run +> +> ```shell +> ./mediapipe/graphs/object_detection_3d/obj_parser/obj_cleanup.sh [INPUT_DIR] [INTERMEDIATE_OUTPUT_DIR] +> ``` +> and then run +> +> ```shell +> bazel run -c opt mediapipe/graphs/object_detection_3d/obj_parser:ObjParser -- input_dir=[INTERMEDIATE_OUTPUT_DIR] output_dir=[OUTPUT_DIR] +> ``` +> INPUT_DIR should be the folder with initial asset .obj files to be processed, +> and OUTPUT_DIR is the folder where the processed asset .uuu file will be placed.
+> +> Note: ObjParser combines all .obj files found in the given directory into a +> single .uuu animation file, using the order given by sorting the filenames alphanumerically. Also the ObjParser directory inputs must be given as +> absolute paths, not relative paths. See parser utility library at [`mediapipe/graphs/object_detection_3d/obj_parser/`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/object_detection_3d/obj_parser/) for more details. + +## Resources + +* Google Developers Blog: + [Instant Motion Tracking With MediaPipe](https://developers.googleblog.com/2020/08/instant-motion-tracking-with-mediapipe.html) +* Google AI Blog: + [The Instant Motion Tracking Behind Motion Stills AR](https://ai.googleblog.com/2018/02/the-instant-motion-tracking-behind.html) +* Paper: + [Instant Motion Tracking and Its Applications to Augmented Reality](https://arxiv.org/abs/1907.06796) diff --git a/docs/solutions/iris.md b/docs/solutions/iris.md new file mode 100644 index 0000000..c0af434 --- /dev/null +++ b/docs/solutions/iris.md @@ -0,0 +1,224 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/vision/face_landmarker/ +title: Iris +parent: MediaPipe Legacy Solutions +nav_order: 3 +--- + +# MediaPipe Iris +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +As of May 10, 2023, this solution was upgraded to a new MediaPipe +Solution. For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/vision/face_landmarker) +site.* + +---- + +## Overview + +A wide range of real-world applications, including computational photography +(glint reflection) and augmented reality effects (virtual avatars), rely on +accurately tracking the iris within an eye. This is a challenging task to solve +on mobile devices, due to the limited computing resources, variable light +conditions and the presence of occlusions, such as hair or people squinting. +Iris tracking can also be utilized to determine the metric distance of the +camera to the user. This can improve a variety of use cases, ranging from +virtual try-on of properly sized glasses and hats to accessibility features that +adapt the font size depending on the viewer’s distance. Often, sophisticated +specialized hardware is employed to compute the metric distance, limiting the +range of devices on which the solution could be applied. + +MediaPipe Iris is an ML solution for accurate iris estimation, able to track +landmarks involving the iris, pupil and the eye contours using a single RGB +camera, in real-time, without the need for specialized hardware. Through use of +iris landmarks, the solution is also able to determine the metric distance +between the subject and the camera with relative error less than 10%. Note that +iris tracking does not infer the location at which people are looking, nor does +it provide any form of identity recognition. With the cross-platform capability +of the MediaPipe framework, MediaPipe Iris can run on most modern +[mobile phones](#mobile), [desktops/laptops](#desktop) and even on the +[web](#web).
+ +![iris_tracking_example.gif](https://mediapipe.dev/images/mobile/iris_tracking_example.gif) | +:------------------------------------------------------------------------: | +*Fig 1. Example of MediaPipe Iris: eyelid (red) and iris (blue) contours.* | + +## ML Pipeline + +The first step in the pipeline leverages [MediaPipe Face Mesh](./face_mesh.md), +which generates a mesh of the approximate face geometry. From this mesh, we +isolate the eye region in the original image for use in the subsequent iris +tracking step. + +The pipeline is implemented as a MediaPipe +[graph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/iris_tracking/iris_tracking_gpu.pbtxt) +that uses a +[face landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_landmark/face_landmark_front_gpu.pbtxt) +from the +[face landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_landmark), +an +[iris landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/iris_landmark/iris_landmark_left_and_right_gpu.pbtxt) +from the +[iris landmark module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/iris_landmark), +and renders using a dedicated +[iris-and-depth renderer subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/iris_tracking/subgraphs/iris_and_depth_renderer_gpu.pbtxt). +The +[face landmark subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_landmark/face_landmark_front_gpu.pbtxt) +internally uses a +[face detection subgraph](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_detection/face_detection_short_range_gpu.pbtxt) +from the +[face detection module](https://github.com/google/mediapipe/tree/master/mediapipe/modules/face_detection). + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). 
For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +The output of the pipeline is a set of 478 3D landmarks, including 468 face +landmarks from [MediaPipe Face Mesh](./face_mesh.md), with those around the eyes +further refined (see Fig 2), and 10 additional iris landmarks appended at the +end (5 for each eye; see also Fig 2). + +## Models + +### Face Detection Model + +The face detector is the same [BlazeFace](https://arxiv.org/abs/1907.05047) +model used in [MediaPipe Face Detection](./face_detection.md). + +### Face Landmark Model + +The face landmark model is the same as in [MediaPipe Face Mesh](./face_mesh.md). +You can also find more details in this +[paper](https://arxiv.org/abs/1907.06724). + +### Iris Landmark Model + +The iris model takes an image patch of the eye region and estimates both the eye +landmarks (along the eyelid) and iris landmarks (along the iris contour). You +can find more details in this [paper](https://arxiv.org/abs/2006.11341). + +![iris_tracking_eye_and_iris_landmarks.png](https://mediapipe.dev/images/mobile/iris_tracking_eye_and_iris_landmarks.png) | +:----------------------------------------------------------------------------------------------------: | +*Fig 2. Eye landmarks (red) and iris landmarks (green).* | + +## Depth-from-Iris + +MediaPipe Iris is able to determine the metric distance of a subject to the +camera with less than 10% error, without requiring any specialized hardware. +This is done by relying on the fact that the horizontal iris diameter of the +human eye remains roughly constant at 11.7±0.5 mm across a wide population, +along with some simple geometric arguments. For more details please refer to our +[Google AI Blog post](https://ai.googleblog.com/2020/08/mediapipe-iris-real-time-iris-tracking.html).
+ +![iris_tracking_depth_from_iris.gif](https://mediapipe.dev/images/mobile/iris_tracking_depth_from_iris.gif) | +:--------------------------------------------------------------------------------------------: | +*Fig 3. (Left) MediaPipe Iris predicting metric distance in cm on a Pixel 2 from iris tracking without use of a depth sensor. (Right) Ground-truth depth.* | + +## Example Apps + +Please first see general instructions for +[Android](../getting_started/android.md), [iOS](../getting_started/ios.md) and +[desktop](../getting_started/cpp.md) on how to build MediaPipe examples. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md). + +### Mobile + +* Graph: + [`mediapipe/graphs/iris_tracking/iris_tracking_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/iris_tracking/iris_tracking_gpu.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/file/d/1cywcNtqk764TlZf1lvSTV4F3NGB2aL1R/view?usp=sharing) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/iristrackinggpu:iristrackinggpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/iristrackinggpu/BUILD) +* iOS target: + [`mediapipe/examples/ios/iristrackinggpu:IrisTrackingGpuApp`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/ios/iristrackinggpu/BUILD) + +### Desktop + +#### Live Camera Input + +Please first see general instructions for [desktop](../getting_started/cpp.md) +on how to build MediaPipe examples.
+ +* Running on CPU + * Graph: + [`mediapipe/graphs/iris_tracking/iris_tracking_cpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/iris_tracking/iris_tracking_cpu.pbtxt) + * Target: + [`mediapipe/examples/desktop/iris_tracking:iris_tracking_cpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/iris_tracking/BUILD) +* Running on GPU + * Graph: + [`mediapipe/graphs/iris_tracking/iris_tracking_gpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/iris_tracking/iris_tracking_gpu.pbtxt) + * Target: + [`mediapipe/examples/desktop/iris_tracking:iris_tracking_gpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/desktop/iris_tracking/BUILD) + +#### Video File Input + +1. To build the application, run: + + ```bash + bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 mediapipe/examples/desktop/iris_tracking:iris_tracking_cpu_video_input + ``` + +2. To run the application, replace `<input video path>` and `<output video path>` in the command below with your own paths: + + ``` + bazel-bin/mediapipe/examples/desktop/iris_tracking/iris_tracking_cpu_video_input \ + --calculator_graph_config_file=mediapipe/graphs/iris_tracking/iris_tracking_cpu_video_input.pbtxt \ + --input_side_packets=input_video_path=<input video path>,output_video_path=<output video path> + ``` + +#### Single-image Depth Estimation + +1. To build the application, run: + + ```bash + bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 mediapipe/examples/desktop/iris_tracking:iris_depth_from_image_desktop + ``` + +2. To run the application, replace `<input image path>` and `<output image path>` in the command below with your own paths: + + ```bash + GLOG_logtostderr=1 bazel-bin/mediapipe/examples/desktop/iris_tracking/iris_depth_from_image_desktop \ + --input_image_path=<input image path> --output_image_path=<output image path> + ``` + +### Web + +You can use the following links to load a demo in the MediaPipe Visualizer, and +then click the "Runner" icon in the top bar as shown below.
The demos +use your webcam video as input, which is processed all locally in real-time and +never leaves your device. Please see +[MediaPipe on the Web](https://developers.googleblog.com/2020/01/mediapipe-on-web.html) +in Google Developers Blog for details. + +![visualizer_runner](https://mediapipe.dev/images/visualizer_runner.png) + +* [MediaPipe Iris](https://viz.mediapipe.dev/demo/iris_tracking) +* [MediaPipe Iris: Depth-from-Iris](https://viz.mediapipe.dev/demo/iris_depth) + +## Resources + +* Google AI Blog: + [MediaPipe Iris: Real-time Eye Tracking and Depth Estimation](https://ai.googleblog.com/2020/08/mediapipe-iris-real-time-iris-tracking.html) +* Paper: + [Real-time Pupil Tracking from Monocular Video for Digital Puppetry](https://arxiv.org/abs/2006.11341) + ([presentation](https://youtu.be/cIhXkiiapQI)) +* [Models and model cards](./models.md#iris) diff --git a/docs/solutions/knift.md b/docs/solutions/knift.md new file mode 100644 index 0000000..19e04cb --- /dev/null +++ b/docs/solutions/knift.md @@ -0,0 +1,157 @@ +--- +layout: forward +target: https://developers.google.com/mediapipe/solutions/guide#legacy +title: KNIFT (Template-based Feature Matching) +parent: MediaPipe Legacy Solutions +nav_order: 13 +--- + +# MediaPipe KNIFT +{: .no_toc } + +
+ + Table of contents + + {: .text-delta } +1. TOC +{:toc} +
+--- + +**Attention:** *Thank you for your interest in MediaPipe Solutions. +We have ended support for this MediaPipe Legacy Solution as of March 1, 2023. +For more information, see the +[MediaPipe Solutions](https://developers.google.com/mediapipe/solutions/guide#legacy) +site.* + +---- + +## Overview + +MediaPipe KNIFT is a template-based feature matching solution using KNIFT +(Keypoint Neural Invariant Feature Transform). + +![knift_stop_sign.gif](https://mediapipe.dev/images/knift_stop_sign.gif) | +:-----------------------------------------------------------------------: | +*Fig 1. Matching a real Stop Sign with a Stop Sign template using KNIFT.* | + +In many computer vision applications, a crucial building block is to establish +reliable correspondences between different views of an object or scene, forming +the foundation for approaches like template matching, image retrieval and +structure from motion. Correspondences are usually computed by extracting +distinctive view-invariant features such as +[SIFT](https://en.wikipedia.org/wiki/Scale-invariant_feature_transform) or +[ORB](https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_feature2d/py_orb/py_orb.html#orb-in-opencv) +from images. The ability to reliably establish such correspondences enables +applications like image stitching to create panoramas or template matching for +object recognition in videos. + +KNIFT is a general purpose local feature descriptor similar to SIFT or ORB. +Likewise, KNIFT is also a compact vector representation of local image patches +that is invariant to uniform scaling, orientation, and illumination changes. +However unlike SIFT or ORB, which were engineered with heuristics, KNIFT is an +[embedding](https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture) +learned directly from a large number of corresponding local patches extracted +from nearby video frames. 
This data-driven approach implicitly encodes complex, +real-world spatial transformations and lighting changes in the embedding. As a +result, the KNIFT feature descriptor appears to be more robust, not only to +[affine distortions](https://en.wikipedia.org/wiki/Affine_transformation), but +to some degree of +[perspective distortions](https://en.wikipedia.org/wiki/Perspective_distortion_\(photography\)) +as well. + +For more information, please see +[MediaPipe KNIFT: Template-based feature matching](https://developers.googleblog.com/2020/04/mediapipe-knift-template-based-feature-matching.html) +on the Google Developers Blog. + +![template_matching_mobile_cpu.gif](https://mediapipe.dev/images/mobile/template_matching_android_cpu.gif) | +:-------------------------------------------------------------------------------------: | +*Fig 2. Matching US dollar bills using KNIFT.* | + +## Example Apps + +### Matching US Dollar Bills + +In MediaPipe, we've already provided an +[index file](https://github.com/google/mediapipe/tree/master/mediapipe/models/knift_index.pb) +pre-computed from the 3 template images (of US dollar bills) shown below. If +you'd like to use your own template images, see +[Matching Your Own Template Images](#matching-your-own-template-images). + +![template_matching_mobile_template.jpg](https://mediapipe.dev/images/mobile/template_matching_mobile_template.jpg) + +Please first see general instructions for +[Android](../getting_started/android.md) on how to build MediaPipe examples. + +Note: To visualize a graph, copy the graph and paste it into +[MediaPipe Visualizer](https://viz.mediapipe.dev/). For more information on how +to visualize its associated subgraphs, please see +[visualizer documentation](../tools/visualizer.md).
+ +* Graph: + [`mediapipe/graphs/template_matching/template_matching_mobile_cpu.pbtxt`](https://github.com/google/mediapipe/tree/master/mediapipe/graphs/template_matching/template_matching_mobile_cpu.pbtxt) +* Android target: + [(or download prebuilt ARM64 APK)](https://drive.google.com/open?id=1tSWRfes9rAM4NrzmJBplguNQQvaeBZSa) + [`mediapipe/examples/android/src/java/com/google/mediapipe/apps/templatematchingcpu:templatematchingcpu`](https://github.com/google/mediapipe/tree/master/mediapipe/examples/android/src/java/com/google/mediapipe/apps/templatematchingcpu/BUILD) + +Note: MediaPipe uses OpenCV 3 by default. However, because of +[issues](https://github.com/opencv/opencv/issues/11488) between NDK 17+ and +OpenCV 3 when using +[knnMatch](https://docs.opencv.org/3.4/db/d39/classcv_1_1DescriptorMatcher.html#a378f35c9b1a5dfa4022839a45cdf0e89), +for this example app please use the following commands to temporarily switch to +OpenCV 4, and switch back to OpenCV 3 afterwards. + +```bash +# Switch to OpenCV 4 +sed -i -e 's:3.4.3/opencv-3.4.3:4.0.1/opencv-4.0.1:g' WORKSPACE +sed -i -e 's:libopencv_java3:libopencv_java4:g' third_party/opencv_android.BUILD + +# Build and install app +bazel build -c opt --config=android_arm64 mediapipe/examples/android/src/java/com/google/mediapipe/apps/templatematchingcpu +adb install -r bazel-bin/mediapipe/examples/android/src/java/com/google/mediapipe/apps/templatematchingcpu/templatematchingcpu.apk + +# Switch back to OpenCV 3 +sed -i -e 's:4.0.1/opencv-4.0.1:3.4.3/opencv-3.4.3:g' WORKSPACE +sed -i -e 's:libopencv_java4:libopencv_java3:g' third_party/opencv_android.BUILD +``` + +Tip: The example uses the TFLite +[XNNPACK delegate](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/delegates/xnnpack) +by default for faster inference. 
Users can change the +[option in TfLiteInferenceCalculator](https://github.com/google/mediapipe/tree/master/mediapipe/calculators/tflite/tflite_inference_calculator.proto) +to run regular TFLite inference. + +### Matching Your Own Template Images + +* Step 1: Put all template images in a single directory. + +* Step 2: To build the index file for all templates in the directory, run + + ```bash + bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \ + mediapipe/examples/desktop/template_matching:template_matching_tflite + ``` + + ```bash + bazel-bin/mediapipe/examples/desktop/template_matching/template_matching_tflite \ + --calculator_graph_config_file=mediapipe/graphs/template_matching/index_building.pbtxt \ + --input_side_packets="file_directory=