Skip to content

Commit

Permalink
Merge pull request #18 from sean1832/17-bug-failed-to-fetch-data-from…
Browse files Browse the repository at this point in the history
…-youtube-after-5-attempts

fix bug #17:  failed to fetch data from youtube
  • Loading branch information
sean1832 authored May 2, 2023
2 parents e760e96 + 0ad6cd0 commit 52530c7
Show file tree
Hide file tree
Showing 10 changed files with 147 additions and 72 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,4 +161,5 @@ cython_debug/
.vscode/

# test folder
.test/
.test/
/test/
7 changes: 3 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ numpy==1.24.2
openai==0.27.2
pydub==0.25.1
PyPDF4==1.27.0
python_docx==0.8.11
pytube==12.1.2
pytube==12.1.3
streamlit==1.20.0
streamlit_toggle_switch==1.0.2
tiktoken==0.3.1
requests==2.29.0
youtube_transcript_api==0.6.0
13 changes: 7 additions & 6 deletions src/Components/Info.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ def info():
info_panel = st.container()

manifest = 'src/manifest.json'
st.session_state['MANIFEST'] = manifest_data = file_io.read_json(manifest)

with info_panel:
st.markdown('---')
st.markdown(f"# {file_io.read_json(manifest, 'name')}")
st.markdown(f"Version: `{file_io.read_json(manifest, 'version')}`")
st.markdown(f"Author: {file_io.read_json(manifest, 'author')}")
st.markdown(f"[Report a bug]({file_io.read_json(manifest, 'bugs')})")
st.markdown(f"[GitHub repo]({file_io.read_json(manifest, 'homepage')})")
st.markdown(f"License: [{file_io.read_json(manifest, 'license')}](https://github.com/sean1832/SumGPT/blob/master/LICENSE)")
st.markdown(f"# {manifest_data['name']}")
st.markdown(f"Version: `{manifest_data['version']}`")
st.markdown(f"Author: {manifest_data['author']}")
st.markdown(f"[Report a bug]({manifest_data['bugs']['url']})")
st.markdown(f"[GitHub repo]({manifest_data['homepage']})")
st.markdown(f"License: [{manifest_data['license']['type']}]({manifest_data['license']['url']})")
43 changes: 23 additions & 20 deletions src/Components/StreamlitSetup.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,36 @@
import streamlit as st
import Data.caption_languages as data
import Modules.file_io as file_io

def setup():
st.set_page_config(page_title="SumGPT", page_icon="📝", layout="wide")
st.set_page_config(page_title="SumGPT", page_icon="📝", layout="wide")

if not st.session_state.get('OPENAI_API_KEY'):
st.session_state['OPENAI_API_KEY'] = None
if not st.session_state.get('OPENAI_API_KEY'):
st.session_state['OPENAI_API_KEY'] = None

if not st.session_state.get('OPENAI_PERSONA_REC'):
st.session_state['OPENAI_PERSONA_REC'] = None
if not st.session_state.get('OPENAI_PERSONA_REC'):
st.session_state['OPENAI_PERSONA_REC'] = None

if not st.session_state.get('OPENAI_PERSONA_SUM'):
st.session_state['OPENAI_PERSONA_SUM'] = None
if not st.session_state.get('OPENAI_PERSONA_SUM'):
st.session_state['OPENAI_PERSONA_SUM'] = None

if not st.session_state.get('CHUNK_SIZE'):
st.session_state['CHUNK_SIZE'] = None
if not st.session_state.get('CHUNK_SIZE'):
st.session_state['CHUNK_SIZE'] = None

if not st.session_state.get('OPENAI_PARAMS'):
st.session_state['OPENAI_PARAMS'] = None
if not st.session_state.get('OPENAI_PARAMS'):
st.session_state['OPENAI_PARAMS'] = None

if not st.session_state.get('DELAY'):
st.session_state['DELAY'] = 0
if not st.session_state.get('DELAY'):
st.session_state['DELAY'] = 0

if not st.session_state.get('FINAL_SUMMARY_MODE'):
st.session_state['FINAL_SUMMARY_MODE'] = False
if not st.session_state.get('FINAL_SUMMARY_MODE'):
st.session_state['FINAL_SUMMARY_MODE'] = False

if not st.session_state.get('CAPTION_LANGUAGES'):
st.session_state['CAPTION_LANGUAGES'] = data.languages + data.auto_languages
if not st.session_state.get('CAPTION_LANGUAGES'):
st.session_state['CAPTION_LANGUAGES'] = data.languages + data.auto_languages
print(st.session_state['CAPTION_LANGUAGES'])

if not st.session_state.get('PREVIOUS_RESULTS'):
st.session_state['PREVIOUS_RESULTS'] = None
if not st.session_state.get('PREVIOUS_RESULTS'):
st.session_state['PREVIOUS_RESULTS'] = None

if not st.session_state.get('MANIFEST'):
st.session_state["MANIFEST"] = file_io.read_json("src/manifest.json")
2 changes: 1 addition & 1 deletion src/Data/caption_languages.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
languages = [
'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh-Hans', 'zh-Hant', 'zh-TW', 'zh', 'ar', 'hi', 'th'
'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh-Hans', 'zh-Hant', 'zh-TW', 'zh-CN', 'zh', 'ar', 'hi', 'th'
]

auto_languages = ['a.' + _language for _language in languages]
Expand Down
97 changes: 97 additions & 0 deletions src/Modules/Youtube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import html
import re
from typing import Any, Dict, List, Tuple, Union

import requests
import streamlit as st
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound


# Application manifest, read from Streamlit session state when this module is imported.
manifest = st.session_state["MANIFEST"]


def _error_report_msg(youtube_url):
    """Build the Markdown hint asking the user to file a bug report.

    Args:
        youtube_url: URL of the video that triggered the error; embedded
            verbatim in the message.

    Returns:
        A Markdown string containing the bug-tracker URL and the version
        number taken from the module-level ``manifest``.
    """
    issue_tracker = manifest['bugs']['url']
    app_version = manifest['version']
    return (
        f"Please create an issue on [GitHub]({issue_tracker}). "
        f"Please include the YouTube URL ({youtube_url}), version number ({app_version}) "
        "and all necessary information to replicate the error. "
        "**Before creating a new issue, please check if the problem has already been reported.**"
    )

def _extract_video_id_from_url(url):
video_id_pattern = r'(?:v=|/v/|youtu\.be/|/embed/|/e/)([^?&"\'>]+)'
match = re.search(video_id_pattern, url)
if match:
return match.group(1)
else:
raise ValueError("Invalid YouTube URL")

def get_video_title(youtube_url):
    """Fetch the title of a YouTube video by scraping its watch page.

    Args:
        youtube_url: Any URL accepted by ``_extract_video_id_from_url``.

    Returns:
        The text of the page's ``<title>`` element without the trailing
        " - YouTube" suffix and with HTML entities decoded, or ``None`` when
        the page contains no such title.

    Raises:
        ValueError: If no video ID can be extracted from ``youtube_url``.
        requests.exceptions.RequestException: If the HTTP request fails or
            exceeds the timeout.
    """
    video_id = _extract_video_id_from_url(youtube_url)
    watch_url = f'https://www.youtube.com/watch?v={video_id}'
    # Bound the request: without a timeout an unresponsive server would
    # block the caller indefinitely.
    response = requests.get(watch_url, timeout=10)
    match = re.search(r'<title>(.+?) - YouTube<\/title>', response.text)
    if match is None:
        return None
    # Page source carries the title HTML-escaped (e.g. "&amp;"); decode it so
    # callers receive the human-readable text.
    return html.unescape(match.group(1))

def get_available_subtitle_languages(video_id):
    """List the language codes of the transcripts offered for a video.

    Args:
        video_id: The YouTube video ID passed to
            ``YouTubeTranscriptApi.list_transcripts``.

    Returns:
        A list with the ``language_code`` of every listed transcript, or an
        empty list if anything goes wrong (the error is printed).
    """
    codes = []
    try:
        for entry in YouTubeTranscriptApi.list_transcripts(video_id):
            codes.append(entry.language_code)
    except Exception as err:
        # Best-effort lookup: report the problem and fall back to "nothing known".
        print(f"Error fetching available subtitle languages: {err}")
        return []
    return codes

def get_video_captions(youtube_url, languages):
    """Fetch the captions of a YouTube video as plain text.

    Languages are tried in the order given and the first one that yields a
    transcript wins. Every failure is shown to the user with ``st.error`` and
    followed by ``st.stop()``.

    Args:
        youtube_url: Any URL accepted by ``_extract_video_id_from_url``.
        languages: A language code, or a sequence of codes in priority order.

    Returns:
        The caption text, one transcript entry per line (each entry is
        followed by a newline), or ``None`` when no language produced a
        transcript and ``st.stop()`` returned instead of halting the script.

    Raises:
        ValueError: If no video ID can be extracted from ``youtube_url``.
        TranscriptsDisabled: Re-raised if transcripts are disabled and
            ``st.stop()`` returns.
    """
    # A bare string would otherwise be iterated character by character below
    # and matched by substring in the support check.
    if isinstance(languages, str):
        languages = [languages]

    video_id = _extract_video_id_from_url(youtube_url)
    simplified_url = f'https://www.youtube.com/watch?v={video_id}'

    available_language = get_available_subtitle_languages(video_id)

    # An empty list means the lookup failed or found nothing; in that case
    # skip the check and let the per-language attempts report the outcome.
    if not any(lang in languages for lang in available_language) and available_language != []:
        print(f"Failed to retrieve transcript: Language {available_language} is/are not yet supported for {simplified_url}.")
        st.error(f'❌ Language {available_language} is/are not yet supported for {simplified_url}.\n\n' + _error_report_msg(simplified_url))
        st.stop()

    for language in languages:
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            return "".join(item['text'] + "\n" for item in transcript)

        except NoTranscriptFound as e:
            # Only complain once the final candidate has failed as well.
            if language != languages[-1]:
                continue
            print(f"Language {available_language} exist in language list but failed to retrieve in YouTubeTranscriptApi.get_transcript: {e}")
            st.error(f'❌ Language {available_language} exist in language list but failed to retrieve in `YouTubeTranscriptApi.get_transcript`:\n\n'
                     f'languages = {available_language}\n\n'
                     f'language list = {languages}\n\n'
                     + _error_report_msg(simplified_url))
            st.stop()

        except TranscriptsDisabled:
            print(f"Failed to retrieve transcript: transcripts disabled for {simplified_url}")
            st.error(f'❌ Subtitles not available for {simplified_url}! \n\n---'
                     f'\n**Instruction:**\n\n'
                     f'1. Verify if the [video]({simplified_url}) has subtitles available.\n\n'
                     f"2. If you are confident that subtitles are available in the video but could not be retrieved, "
                     + _error_report_msg(simplified_url))
            st.stop()
            # NOTE(review): st.stop() is expected to halt the script here. If
            # it ever returns, propagate the caught exception unchanged rather
            # than raising the exception class without arguments.
            raise

        except Exception as e:
            print(e)
            st.error(f'❌ Failed to fetch data from YouTube for {simplified_url}. \n\n'
                     f'{_error_report_msg(simplified_url)}'
                     f'\n\nError: \n\n---\n\n{e}')
            st.stop()
            # Give up on the remaining languages after an unexpected failure.
            return None

    return None

@st.cache_data(show_spinner=False)
def extract_youtube_transcript(url: str, lang_code: str | List[str] = 'a.en') -> Tuple[str, str]:
    """Return the captions and title of a YouTube video.

    Results are cached through ``st.cache_data`` with the spinner disabled.

    Args:
        url: URL of the video.
        lang_code: Caption language code(s) forwarded to ``get_video_captions``.

    Returns:
        A ``(transcript, title)`` pair as produced by ``get_video_captions``
        and ``get_video_title``.
    """
    # Captions are requested first; the title lookup only runs afterwards.
    captions = get_video_captions(url, lang_code)
    return captions, get_video_title(url)
8 changes: 5 additions & 3 deletions src/SumGPT.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import asyncio
import Components
import streamlit as st

import Components.StreamlitSetup

import Modules.Youtube
from Components.sidebar import sidebar
import Modules.file_io as file_io
import GPT
import util
import time

Components.StreamlitSetup.setup()

app_header = st.container()

file_handler = st.container()
Expand Down Expand Up @@ -36,7 +38,7 @@
if youtube_link:
upload_file_emtpy.empty()
with st.spinner("🔍 Extracting transcript..."):
transcript, title = util.extract_youtube_transcript(youtube_link, st.session_state['CAPTION_LANGUAGES'])
transcript, title = Modules.Youtube.extract_youtube_transcript(youtube_link, st.session_state['CAPTION_LANGUAGES'])
file_content = {'name': f"{title}.txt", 'content': transcript}
elif upload_file:
youtube_link_empty.empty()
Expand Down
11 changes: 8 additions & 3 deletions src/manifest.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
{
"name": "SumGPT",
"version": "1.0.5",
"license": "MIT",
"version": "1.0.7",
"license": {
"type": "MIT",
"url": "https://github.com/sean1832/SumGPT/blob/master/LICENSE"
},
"author": "Zeke Zhang",
"homepage": "https://github.com/sean1832/SumGPT",
"repository": {
"type": "git",
"url": "https://github.com/sean1832/SumGPT"
},
"bugs": "https://github.com/sean1832/SumGPT/issues"
"bugs": {
"url": "https://github.com/sean1832/SumGPT/issues"
}
}
2 changes: 1 addition & 1 deletion src/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def _extract_xml_caption(xml: str, is_auto_lang: bool) -> str:
text_content = text
return text_content.strip()

@st.cache_data(show_spinner=False)
# @st.cache_data(show_spinner=False)
def _get_caption(url: str, lang_code: str | List[str] = 'a.en') -> str:
"""Extracts the transcript from a YouTube video."""
attempt = 3
Expand Down
33 changes: 0 additions & 33 deletions test/test.py

This file was deleted.

0 comments on commit 52530c7

Please sign in to comment.