import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin
import os
# Recursively scrape musician biography pages and save each as a text file.
def extract_text_from_page(url, visited=None):
    """Download *url*, write its paragraph text to berlin/<musician>.txt, and
    recurse into linked musician pages on the Berliner Philharmoniker site.

    Args:
        url: Page to scrape.
        visited: Set of URLs already scraped. Omitted by callers; the
            recursion threads it through so mutually-linked musician pages
            are fetched once instead of recursing forever.

    Raises:
        requests.HTTPError: if the server returns an error status, so an
            error page's text is never silently written to disk.
    """
    if visited is None:
        visited = set()
    if url in visited:
        return
    visited.add(url)

    # Fetch and parse the page.
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect all paragraph text from the page, blank-line separated.
    text = "".join(p.get_text() + "\n\n" for p in soup.find_all("p"))

    # The musician name is taken as the second-to-last URL path segment
    # (assumes paths end with a trailing slash on this site — TODO confirm).
    musician_name = urlparse(url).path.split("/")[-2]

    # Write the biography to berlin/<musician_name>.txt.
    folder_path = 'berlin'
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, musician_name + '.txt')
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    # Follow only links that point at musician pages; resolve relative hrefs
    # against the current page first.
    for link in soup.find_all("a", href=True):
        subpage_url = link.get("href")
        if not subpage_url.startswith("http"):
            subpage_url = urljoin(url, subpage_url)
        if "berliner-philharmoniker.de/en/orchestra/musician" in subpage_url:
            extract_text_from_page(subpage_url, visited)
# Entry point: scrape the orchestra overview page and, transitively, every
# linked musician biography page.
url = "https://www.berliner-philharmoniker.de/en/orchestra/"
extract_text_from_page(url)
Introduction
This little project showcases the intersection of web scraping and natural language processing technologies to create a useful and engaging tool for website visitors.
It involves using Beautiful Soup to extract content from the Berliner Philharmoniker website, LangChain to connect to OpenAI’s API, and Gradio to build a nice GUI chatbot. The scraped content will then be used to ground a GPT-powered chatbot that can interact with website visitors in a natural language conversation.
By scraping the website, you can gather information such as concert schedules, artist biographies, and historical archives. This data will be fed into the GPT model, which will learn from the patterns and structures of the content, to generate responses to user queries.
There are three simple steps to build the chatbot:
1. Getting the content of the website with BeautifulSoup
2. Building the database: Embeddings, Vectorize, Chroma
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, TextLoader
import os

# NOTE(review): supply a real key (ideally from the environment, not a
# hard-coded literal) before running — an empty key makes every OpenAI
# call fail.
os.environ['OPENAI_API_KEY'] = ''

# Load every biography file produced by the scraping step.
loader = DirectoryLoader(
    'berlin',
    glob="*.txt",
)
documents = loader.load()

# Split documents into overlapping ~1024-character chunks so each embedding
# covers a manageable span of text; the 128-character overlap preserves
# context across chunk boundaries.
text_splitter = CharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI and persist the Chroma index to ./db so the
# chatbot step can reload it without re-embedding.
persist_directory = "db"

embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)
vectordb.persist()
3. Running the bot with a Gradio GUI
import gradio as gr
import random
import time
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI
import os

# NOTE(review): supply a real key (ideally from the environment) before
# running — an empty key makes every OpenAI call fail.
os.environ['OPENAI_API_KEY'] = ''

# Reload the vector index persisted by the database-building step.
persist_directory = 'db'

embeddings = OpenAIEmbeddings()

db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

# Conversation memory keyed the way ConversationalRetrievalChain expects;
# history is stored as plain strings rather than message objects.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=False,
)

# Retrieval-augmented QA chain: embeds the question, retrieves matching
# chunks from Chroma, and "stuffs" them into a single LLM prompt.
# max_tokens=-1 presumably lets the wrapper size the completion to the
# model's remaining context — TODO confirm against the LangChain docs.
qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0.2, max_tokens=-1),
    chain_type="stuff",
    retriever=db.as_retriever(),
    memory=memory,
    get_chat_history=lambda h: h,
    verbose=True,
    max_tokens_limit=4097,
)

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Append the user's turn (answer pending) and clear the textbox."""
        return "", history + [[user_message, None]]

    def bot(history):
        """Answer the latest question using the QA chain and fill it in."""
        bot_message = qa.run({"question": history[-1][0], "chat_history": history[:-1]})
        history[-1][1] = bot_message
        time.sleep(1)
        return history

    # Submitting the textbox records the user turn immediately (unqueued for
    # responsiveness), then runs the model to produce the reply.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()