Merge pull request #18 from sean1832/17-bug-failed-to-fetch-data-from-youtube-after-5-attempts

fix bug #17: failed to fetch data from youtube
sean1832 · May 2, 2023 · 52530c7 · 52530c7
2 parents e760e96 + 0ad6cd0
commit 52530c7
Show file tree

Hide file tree

Showing 10 changed files with 147 additions and 72 deletions.
diff --git a/.gitignore b/.gitignore
@@ -161,4 +161,5 @@ cython_debug/
 .vscode/
 
 # test folder
-.test/
+.test/
+/test/
diff --git a/requirements.txt b/requirements.txt
@@ -5,8 +5,7 @@ numpy==1.24.2
 openai==0.27.2
 pydub==0.25.1
 PyPDF4==1.27.0
-python_docx==0.8.11
-pytube==12.1.2
+pytube==12.1.3
 streamlit==1.20.0
-streamlit_toggle_switch==1.0.2
-tiktoken==0.3.1
+requests==2.29.0
+youtube_transcript_api==0.6.0
diff --git a/src/Components/Info.py b/src/Components/Info.py
@@ -6,12 +6,13 @@ def info():
     info_panel = st.container()
 
     manifest = 'src/manifest.json'
+    st.session_state['MANIFEST'] = manifest_data = file_io.read_json(manifest)
 
     with info_panel:
         st.markdown('---')
-        st.markdown(f"# {file_io.read_json(manifest, 'name')}")
-        st.markdown(f"Version: `{file_io.read_json(manifest, 'version')}`")
-        st.markdown(f"Author: {file_io.read_json(manifest, 'author')}")
-        st.markdown(f"[Report a bug]({file_io.read_json(manifest, 'bugs')})")
-        st.markdown(f"[GitHub repo]({file_io.read_json(manifest, 'homepage')})")
-        st.markdown(f"License: [{file_io.read_json(manifest, 'license')}](https://github.com/sean1832/SumGPT/blob/master/LICENSE)")
+        st.markdown(f"# {manifest_data['name']}")
+        st.markdown(f"Version: `{manifest_data['version']}`")
+        st.markdown(f"Author: {manifest_data['author']}")
+        st.markdown(f"[Report a bug]({manifest_data['bugs']['url']})")
+        st.markdown(f"[GitHub repo]({manifest_data['homepage']})")
+        st.markdown(f"License: [{manifest_data['license']['type']}]({manifest_data['license']['url']})")
diff --git a/src/Components/StreamlitSetup.py b/src/Components/StreamlitSetup.py
@@ -1,33 +1,36 @@
 import streamlit as st
 import Data.caption_languages as data
+import Modules.file_io as file_io
 
-def setup():
-    st.set_page_config(page_title="SumGPT", page_icon="📝", layout="wide")
+st.set_page_config(page_title="SumGPT", page_icon="📝", layout="wide")
 
-    if not st.session_state.get('OPENAI_API_KEY'):
-        st.session_state['OPENAI_API_KEY'] = None
+if not st.session_state.get('OPENAI_API_KEY'):
+    st.session_state['OPENAI_API_KEY'] = None
 
-    if not st.session_state.get('OPENAI_PERSONA_REC'):
-        st.session_state['OPENAI_PERSONA_REC'] = None
+if not st.session_state.get('OPENAI_PERSONA_REC'):
+    st.session_state['OPENAI_PERSONA_REC'] = None
 
-    if not st.session_state.get('OPENAI_PERSONA_SUM'):
-        st.session_state['OPENAI_PERSONA_SUM'] = None
+if not st.session_state.get('OPENAI_PERSONA_SUM'):
+    st.session_state['OPENAI_PERSONA_SUM'] = None
 
-    if not st.session_state.get('CHUNK_SIZE'):
-        st.session_state['CHUNK_SIZE'] = None
+if not st.session_state.get('CHUNK_SIZE'):
+    st.session_state['CHUNK_SIZE'] = None
 
-    if not st.session_state.get('OPENAI_PARAMS'):
-        st.session_state['OPENAI_PARAMS'] = None
+if not st.session_state.get('OPENAI_PARAMS'):
+    st.session_state['OPENAI_PARAMS'] = None
 
-    if not st.session_state.get('DELAY'):
-        st.session_state['DELAY'] = 0
+if not st.session_state.get('DELAY'):
+    st.session_state['DELAY'] = 0
 
-    if not st.session_state.get('FINAL_SUMMARY_MODE'):
-        st.session_state['FINAL_SUMMARY_MODE'] = False
+if not st.session_state.get('FINAL_SUMMARY_MODE'):
+    st.session_state['FINAL_SUMMARY_MODE'] = False
 
-    if not st.session_state.get('CAPTION_LANGUAGES'):
-        st.session_state['CAPTION_LANGUAGES'] = data.languages + data.auto_languages
+if not st.session_state.get('CAPTION_LANGUAGES'):
+    # Default search order: the plain codes from data.languages followed by
+    # the 'a.'-prefixed variants from data.auto_languages.
+    st.session_state['CAPTION_LANGUAGES'] = data.languages + data.auto_languages
 
-    if not st.session_state.get('PREVIOUS_RESULTS'):
-        st.session_state['PREVIOUS_RESULTS'] = None
+if not st.session_state.get('PREVIOUS_RESULTS'):
+    st.session_state['PREVIOUS_RESULTS'] = None
 
+if not st.session_state.get('MANIFEST'):
+    st.session_state["MANIFEST"] = file_io.read_json("src/manifest.json")
diff --git a/src/Data/caption_languages.py b/src/Data/caption_languages.py
@@ -1,5 +1,5 @@
 languages = [
-    'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh-Hans', 'zh-Hant', 'zh-TW', 'zh', 'ar', 'hi', 'th'
+    'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh-Hans', 'zh-Hant', 'zh-TW', 'zh-CN', 'zh', 'ar', 'hi', 'th'
 ]
 
 auto_languages = ['a.' + _language for _language in languages]

diff --git a/src/Modules/Youtube.py b/src/Modules/Youtube.py
@@ -0,0 +1,97 @@
+import requests
+import re
+from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
+import streamlit as st
+from typing import Any, Dict, List, Tuple, Union
+
+
+# Read at import time: st.session_state["MANIFEST"] must already be populated
+# when this module is first imported, otherwise the lookup below fails.
+manifest = st.session_state["MANIFEST"]
+
+
+def _error_report_msg(youtube_url):
+    """Build the markdown text asking the user to report a failure on GitHub.
+
+    The message embeds the bug-tracker URL and the version taken from
+    ``manifest`` together with the given ``youtube_url``.
+    """
+    report_parts = (
+        f"Please create an issue on [GitHub]({manifest['bugs']['url']}). ",
+        f"Please include the YouTube URL ({youtube_url}), version number ({manifest['version']}) ",
+        f"and all necessary information to replicate the error. ",
+        f"**Before creating a new issue, please check if the problem has already been reported.**",
+    )
+    return "".join(report_parts)
+
+def _extract_video_id_from_url(url):
+    """Return the video ID embedded in a YouTube URL.
+
+    Recognises the ``v=``, ``/v/``, ``youtu.be/``, ``/embed/``, ``/e/``,
+    ``/shorts/`` and ``/live/`` URL forms.
+
+    Raises:
+        ValueError: if none of those forms is found in ``url``.
+    """
+    # Stop the ID at query, fragment and path delimiters so that trailing parts
+    # such as ``#t=30`` or a closing ``/`` are not returned as part of the ID.
+    video_id_pattern = r'(?:v=|/v/|youtu\.be/|/embed/|/e/|/shorts/|/live/)([^?&#/"\'>\s]+)'
+    match = re.search(video_id_pattern, url)
+    if match is None:
+        raise ValueError("Invalid YouTube URL")
+    return match.group(1)
+
+def get_video_title(youtube_url):
+    """Fetch the watch page of a YouTube video and return its title.
+
+    Returns:
+        The text of the page's ``<title>`` tag without the trailing
+        `` - YouTube``, with HTML entities decoded, or ``None`` when no such
+        tag is found in the response.
+
+    Raises:
+        ValueError: if ``youtube_url`` contains no recognisable video ID.
+    """
+    import html  # local import: only needed to decode entities in the title
+
+    video_id = _extract_video_id_from_url(youtube_url)
+    url = f'https://www.youtube.com/watch?v={video_id}'
+    # Bound the request so an unresponsive server cannot hang the app forever.
+    response = requests.get(url, timeout=10)
+    title_pattern = r'<title>(.+?) - YouTube<\/title>'
+    match = re.search(title_pattern, response.text)
+    if match is None:
+        return None
+    # The page is HTML, so e.g. ``&amp;`` has to be turned back into ``&``.
+    return html.unescape(match.group(1))
+
+def get_available_subtitle_languages(video_id):
+    """Collect the language codes of the transcripts listed for ``video_id``.
+
+    Best effort: any failure while listing is printed and an empty list is
+    returned instead.
+    """
+    try:
+        codes = []
+        for entry in YouTubeTranscriptApi.list_transcripts(video_id):
+            codes.append(entry.language_code)
+        return codes
+    except Exception as e:
+        print(f"Error fetching available subtitle languages: {e}")
+        return []
+
+def get_video_captions(youtube_url, languages):
+    """Return the captions of a YouTube video as newline-separated text.
+
+    The codes in ``languages`` are tried in order and the first transcript
+    that can be retrieved wins. Failures are shown with ``st.error`` and the
+    Streamlit script is then halted with ``st.stop()``.
+
+    Args:
+        youtube_url: Any URL accepted by ``_extract_video_id_from_url``.
+        languages: A single language code or a list of language codes.
+
+    Returns:
+        The caption texts, each followed by a newline, or ``None`` when
+        ``languages`` is empty.
+
+    Raises:
+        ValueError: if ``youtube_url`` contains no recognisable video ID.
+    """
+    # A bare string would otherwise be iterated character by character.
+    if isinstance(languages, str):
+        languages = [languages]
+
+    video_id = _extract_video_id_from_url(youtube_url)
+    simplified_url = f'https://www.youtube.com/watch?v={video_id}'
+
+    available_language = get_available_subtitle_languages(video_id)
+
+    # An empty list means the listing failed or found nothing; in that case
+    # fall through and let get_transcript report the actual problem.
+    if available_language and not any(lang in languages for lang in available_language):
+        print(f"Failed to retrieve transcript: Language {available_language} is/are not yet supported for {simplified_url}.")
+        st.error(f'❌ Language {available_language} is/are not yet supported for {simplified_url}.\n\n' + _error_report_msg(simplified_url))
+        st.stop()
+
+    for language in languages:
+        try:
+            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
+            return "".join(item['text'] + "\n" for item in transcript)
+
+        except NoTranscriptFound as e:
+            # Only report once every requested language has been tried.
+            if language != languages[-1]:
+                continue
+            print(f"Language {available_language} exist in language list but failed to retrieve in YouTubeTranscriptApi.get_transcript: {e}")
+            st.error(f'❌ Language {available_language} exist in language list but failed to retrieve in `YouTubeTranscriptApi.get_transcript`:\n\n'
+                     f'languages = {available_language}\n\n'
+                     f'language list = {languages}\n\n'
+                     + _error_report_msg(simplified_url))
+            st.stop()
+
+        except TranscriptsDisabled:
+            print(f"Failed to retrieve transcript: transcripts disabled for {simplified_url}")
+            st.error(f'❌ Subtitles not available for {simplified_url}! \n\n---'
+                     f'\n**Instruction:**\n\n'
+                     f'1. Verify if the [video]({simplified_url}) has subtitles available.\n\n'
+                     f"2. If you are confident that subtitles are available in the video but could not be retrieved, "
+                     + _error_report_msg(simplified_url))
+            st.stop()
+            # Only reached if st.stop() returns: re-raise the caught exception
+            # itself so the original error details are kept.
+            raise
+
+        except Exception as e:
+            print(e)
+            st.error(f'❌ Failed to fetch data from YouTube for {simplified_url}. \n\n'
+                     f'{_error_report_msg(simplified_url)}'
+                     f'\n\nError: \n\n---\n\n{e}')
+            st.stop()
+            break
+
+@st.cache_data(show_spinner=False)
+def extract_youtube_transcript(url: str, lang_code: str | List[str] = 'a.en') -> Tuple[str, str]:
+    """Fetch the captions and the title of a YouTube video.
+
+    The captions are retrieved first via ``get_video_captions``, then the
+    title via ``get_video_title`` (which yields ``None`` when no title is
+    found). The function is wrapped in ``st.cache_data``.
+    """
+    captions = get_video_captions(url, lang_code)
+    return captions, get_video_title(url)
diff --git a/src/SumGPT.py b/src/SumGPT.py
@@ -1,14 +1,16 @@
 import asyncio
 import Components
 import streamlit as st
+
+import Components.StreamlitSetup
+
+import Modules.Youtube
 from Components.sidebar import sidebar
 import Modules.file_io as file_io
 import GPT
 import util
 import time
 
-Components.StreamlitSetup.setup()
-
 app_header = st.container()
 
 file_handler = st.container()
@@ -36,7 +38,7 @@
     if youtube_link:
         upload_file_emtpy.empty()
         with st.spinner("🔍 Extracting transcript..."):
-            transcript, title = util.extract_youtube_transcript(youtube_link, st.session_state['CAPTION_LANGUAGES'])
+            transcript, title = Modules.Youtube.extract_youtube_transcript(youtube_link, st.session_state['CAPTION_LANGUAGES'])
             file_content = {'name': f"{title}.txt", 'content': transcript}
     elif upload_file:
         youtube_link_empty.empty()

diff --git a/src/manifest.json b/src/manifest.json
@@ -1,12 +1,17 @@
 {
   "name": "SumGPT",
-  "version": "1.0.5",
-  "license": "MIT",
+  "version": "1.0.7",
+  "license": {
+    "type": "MIT",
+    "url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE"
+  },
   "author": "Zeke Zhang",
   "homepage": "https://github.com/sean1832/SumGPT",
   "repository": {
     "type": "git",
     "url": "https://github.com/sean1832/SumGPT"
   },
-  "bugs": "https://github.com/sean1832/SumGPT/issues"
+  "bugs": {
+    "url": "https://github.com/sean1832/SumGPT/issues"
+  }
 }
diff --git a/src/util.py b/src/util.py
@@ -42,7 +42,7 @@ def _extract_xml_caption(xml: str, is_auto_lang: bool) -> str:
         text_content = text
     return text_content.strip()
 
-@st.cache_data(show_spinner=False)
+# @st.cache_data(show_spinner=False)
 def _get_caption(url: str, lang_code: str | List[str] = 'a.en') -> str:
     """Extracts the transcript from a YouTube video."""
     attempt = 3

diff --git a/test/test.py b/test/test.py
-Original file line number
+Diff line change
@@ Expand Up / @@ -161,4 +161,5 @@ cython_debug/ @@
     .vscode/
     # test folder
-    .test/
+    .test/
+    /test/