GPT-Powered Conversational Bot for a Website

Scraping the Berliner Philharmoniker website and using the content to create a GPT-powered chatbot for the website.
gpt
langchain
chatbot
beautifulsoup
Gradio
Author

Shai Nisan

Published

April 13, 2023

Introduction

This little project showcases the intersection of web scraping and natural language processing technologies to create a useful and engaging tool for website visitors.

It involves using Beautiful Soup to extract content from the Berliner Philharmoniker website, Langchain to connect to OpenAI’s API, and Gradio to build a nice GUI chatbot. The scraped content will then be used to ground a GPT-powered chatbot that can interact with website visitors in a natural language conversation.

By scraping the website, you can gather information such as concert schedules, artist biographies, and historical archives. This data is embedded and stored in a vector database; at query time the relevant chunks are retrieved and handed to the GPT model, which grounds its responses to user queries in them.

There are three simple steps to build the chatbot:

1. Getting the content of the website with BeautifulSoup

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin
import os

# function to extract text from a webpage and all its subpages
def extract_text_from_page(url, visited=None):
    """Scrape the paragraph text of *url* into berlin/<name>.txt and recurse
    into every linked musician page.

    Parameters:
        url: page to scrape.
        visited: set of URLs already processed, used to break link cycles.
            Callers may omit it; a fresh set is created on the first call.
    """
    # Musician pages link to one another, so without this guard the
    # recursion below never terminates and re-fetches the same pages.
    if visited is None:
        visited = set()
    if url in visited:
        return
    visited.add(url)

    # Send a GET request to the page
    response = requests.get(url)

    # Parse the HTML using Beautiful Soup
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract all the text from the page
    text = ""
    for paragraph in soup.find_all("p"):
        text += paragraph.get_text() + "\n\n"

    # Extract the musician name from the URL (second-to-last path segment,
    # e.g. ".../musician/<name>/" -> "<name>")
    musician_name = urlparse(url).path.split("/")[-2]

    # Write the musician's biography to a text file
    folder_path = 'berlin'
    os.makedirs(folder_path, exist_ok=True)
    file_path = os.path.join(folder_path, musician_name + '.txt')
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(text)

    # Recursively extract text from all sub-pages, resolving relative links
    # and following only musician pages; share the visited set downward.
    for link in soup.find_all("a", href=True):
        subpage_url = link.get("href")
        if not subpage_url.startswith("http"):
            subpage_url = urljoin(url, subpage_url)
        if "berliner-philharmoniker.de/en/orchestra/musician" in subpage_url:
            extract_text_from_page(subpage_url, visited)

# Extract text from the main page and all sub-pages.
# Entry point of the crawl: every musician biography linked (directly or
# transitively) from the orchestra landing page ends up under berlin/.
url = "https://www.berliner-philharmoniker.de/en/orchestra/"
extract_text_from_page(url)

2. Building the database: Embeddings, Vectorize, Chroma

from langchain.vectorstores.chroma import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

from langchain.document_loaders import DirectoryLoader, TextLoader

import os

# Respect an OPENAI_API_KEY already set in the environment; only fall back
# to a placeholder when none exists (the original assignment wiped any
# real key the user had exported).
os.environ.setdefault('OPENAI_API_KEY', '')

# Load every scraped biography produced in step 1. TextLoader reads the
# plain-text files directly; the files were written as UTF-8 by the scraper.
loader = DirectoryLoader(
    'berlin',
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)

documents = loader.load()

# Split long biographies into overlapping chunks so each chunk fits
# comfortably into a single embedding request.
text_splitter = CharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)
texts = text_splitter.split_documents(documents)

persist_directory = "db"

# Embed the chunks and persist the Chroma vector store to disk so the
# chatbot in step 3 can reopen it without re-embedding.
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)
vectordb.persist()

3. Running the bot with a Gradio GUI

import gradio as gr
import random
import time

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
#from langchain.llms import ConversationalRetrievalChain
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI


import os

# Respect an OPENAI_API_KEY already set in the environment; only fall back
# to a placeholder when none exists (the original assignment wiped any
# real key the user had exported).
os.environ.setdefault('OPENAI_API_KEY', '')


persist_directory = 'db'

embeddings = OpenAIEmbeddings()

# Reopen the vector store that step 2 persisted to disk.
db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

# return_messages=False keeps the buffered history as a plain string,
# which matches the pass-through get_chat_history below.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=False,
)

qa = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(temperature=0.2, max_tokens=-1),  # -1: use the remaining context budget
    chain_type="stuff",  # stuff all retrieved chunks into a single prompt
    retriever=db.as_retriever(),
    memory=memory,
    get_chat_history=lambda h: h,  # history is already a string; pass it through
    verbose=True,
    max_tokens_limit=4097,  # trim retrieved docs to the model's context window
)

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Append the user's turn (reply pending) and clear the input box."""
        updated = history + [[user_message, None]]
        return "", updated

    def bot(history):
        """Fill in the assistant reply for the most recent user turn."""
        question = history[-1][0]
        earlier_turns = history[:-1]
        answer = qa.run({"question": question, "chat_history": earlier_turns})
        history[-1][1] = answer
        time.sleep(1)
        return history

    # Two-stage wiring: echo the user turn immediately, then stream in
    # the model's answer once the chain returns.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()