Prepare v0.4.0 release (#184)

* update tests license first lines * update project infos and license * bump version + update dependencies * fix quality * fix project description * move notebooks to new folder + inference test scripts
Wordcab · Aug 2, 2023 · ad689f4 · ad689f4
1 parent 93b26fa
commit ad689f4
Show file tree

Hide file tree

Showing 24 changed files with 382 additions and 713 deletions.
diff --git a/.env b/.env
@@ -8,9 +8,9 @@
 # The name of the project, used for API documentation.
 PROJECT_NAME="Wordcab Transcribe"
 # The version of the project, used for API documentation.
-VERSION="0.3.1"
+VERSION="0.4.0"
 # The description of the project, used for API documentation.
-DESCRIPTION="💬 ASR FastAPI server using faster-whisper and NVIDIA NeMo."
+DESCRIPTION="💬 ASR FastAPI server using faster-whisper and Auto-Tuning Spectral Clustering for diarization."
 # This API prefix is used for all endpoints in the API outside of the status and cortex endpoints.
 API_PREFIX="/api/v1"
 # Debug mode for FastAPI. It allows for hot reloading when code changes in development.

diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+* text=auto eol=lf
+notebooks/** linguist-vendored
diff --git a/README.md b/README.md
@@ -1,17 +1,19 @@
 # Wordcab Transcribe 💬
 
-FastAPI based API for transcribing audio files using [`faster-whisper`](https://github.com/guillaumekln/faster-whisper) and [`NVIDIA NeMo`](https://github.com/NVIDIA/NeMo)
+FastAPI-based API for transcribing audio files using [`faster-whisper`](https://github.com/guillaumekln/faster-whisper)
+and [Auto-Tuning-Spectral-Clustering](https://arxiv.org/pdf/2003.02405.pdf) for diarization
+(based on this [GitHub implementation](https://github.com/tango4j/Auto-Tuning-Spectral-Clustering)).
 
 More details on this project on this [blog post](https://wordcab.github.io/wordcab-posts/blog/2023/03/31/wordcab-transcribe/).
 
 ## Key features
 
-- 🤗 Open-source: Our project is open-source and based on open-source libraries, allowing you to customize and extend it as needed.
 - ⚡ Fast: The faster-whisper library and CTranslate2 make audio processing incredibly fast compared to other implementations.
 - 🐳 Easy to deploy: You can deploy the project on your workstation or in the cloud using Docker.
 - 🔥 Batch requests: You can transcribe multiple audio files at once because batch requests are implemented in the API.
 - 💸 Cost-effective: As an open-source solution, you won't have to pay for costly ASR platforms.
 - 🫶 Easy-to-use API: With just a few lines of code, you can use the API to transcribe audio files or even YouTube videos.
+- 🤗 Open-source (commercial use under the [WTLv0.1 license](https://github.com/Wordcab/wordcab-transcribe/blob/main/LICENSE), please reach out to `[email protected]`): Our project is open-source and based on open-source libraries, allowing you to customize and extend it as needed, as long as you don't sell it as a hosted service.
 
 ## Requirements
 

diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep
diff --git a/notebooks/async_inference.py b/notebooks/async_inference.py
@@ -0,0 +1,40 @@
+import json
+import aiohttp
+
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+params = {"url": "https://youtu.be/JZ696sbfPHs"}
+# params = {"url": "https://youtu.be/CNzSJ5SGhqU"}
+# params = {"url": "https://youtu.be/pmjrj_TrOEI"}
+# params = {"url": "https://youtu.be/SVwLEocqK0E"}
+
+data = {
+    "alignment": False,  # Longer processing time but better timestamps
+    "diarization": False,  # Longer processing time but speaker segment attribution
+    "source_lang": "en",  # optional, default is "en"
+    "timestamps": "s",  # optional, default is "s". Can be "s", "ms" or "hms".
+    "use_batch": False,  # optional, default is False
+    "internal_vad": False,  # optional, default is False
+    "word_timestamps": True,  # optional, default is False
+}
+
+async def fetch(session, params):
+    async with session.post(
+        "http://localhost:5001/api/v1/youtube",
+        headers=headers,
+        params=params,
+        data=json.dumps(data),
+    ) as response:
+        return await response.json()
+
+async def main():
+    async with aiohttp.ClientSession() as session:
+        responses = await asyncio.gather(
+            *[fetch(session, params) for _ in range(15)]
+        )
+        for response in responses:
+            print(response["audio_duration"])
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
diff --git a/notebooks/local_audio_inference.py b/notebooks/local_audio_inference.py
@@ -0,0 +1,28 @@
+import json
+import requests
+
+
+filepath = "data/short_one_speaker.mp3"
+
+data = {
+  "alignment": False,  # Longer processing time but better timestamps
+  "diarization": True,  # Longer processing time but speaker segment attribution
+  "dual_channel": False,  # Only for stereo audio files with one speaker per channel
+  "source_lang": "ru",  # optional, default is "en"
+  "timestamps": "s",  # optional, default is "s". Can be "s", "ms" or "hms".
+  "word_timestamps": False,  # optional, default is False
+}
+
+with open(filepath, "rb") as f:
+    files = {"file": f}
+    response = requests.post(
+        "http://localhost:5001/api/v1/audio",
+        files=files,
+        data=data,
+    )
+
+r_json = response.json()
+
+filename = filepath.split(".")[0]
+with open(f"{filename}.json", "w", encoding="utf-8") as f:
+  json.dump(r_json, f, indent=4, ensure_ascii=False)
diff --git a/split_diarization.ipynb → notebooks/split_diarization.ipynb b/split_diarization.ipynb → notebooks/split_diarization.ipynb
diff --git a/notebooks/youtube_inference.py b/notebooks/youtube_inference.py
@@ -0,0 +1,35 @@
+import json
+import requests
+
+headers = {"accept": "application/json", "Content-Type": "application/json"}
+# params = {"url": "https://youtu.be/JZ696sbfPHs"}
+# params = {"url": "https://youtu.be/CNzSJ5SGhqU"}
+# params = {"url": "https://youtu.be/vAvcxeXtBz0"}
+# params = {"url": "https://youtu.be/pmjrj_TrOEI"}
+# params = {"url": "https://youtu.be/SVwLEocqK0E"}
+params = {"url": "https://youtu.be/ry9SYnV3svc"}
+# params = {"url": "https://youtu.be/oAhVu3HvWnw"}
+# params = {"url": "https://youtu.be/sfQMxf9Dm8I"}
+# params = {"url": "https://youtu.be/uLBZf9eS4Y0"}
+
+data = {
+  "alignment": False,  # Longer processing time but better timestamps
+  "diarization": True,  # Longer processing time but speaker segment attribution
+  "source_lang": "en",  # optional, default is "en"
+  "timestamps": "s",  # optional, default is "s". Can be "s", "ms" or "hms".
+  "use_batch": False,  # optional, default is False
+  "internal_vad": False,  # optional, default is False
+  "word_timestamps": False,  # optional, default is False
+}
+
+response = requests.post(
+  "http://localhost:5001/api/v1/youtube",
+  headers=headers,
+  params=params,
+  data=json.dumps(data),
+)
+
+r_json = response.json()
+
+with open("data/youtube_output.json", "w", encoding="utf-8") as f:
+  json.dump(r_json, f, indent=4, ensure_ascii=False)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		* text=auto eol=lf
		notebooks/** linguist-vendored