Hello everyone,
First of all, I want to thank DeepLearning.AI and LangChain for their incredible work. Thank you very much.
I have one goal: a chatbot that can retrieve information from PDFs.
My steps:
- I followed the entire course "LangChain Chat with Your Data"
- I installed the environment:
  - Windows 11 version 23H2 (OS build 22631.3007)
  - Python v3.11.7
  - The full pip freeze output (list of installed modules) is at the end of this post
- I started to develop my own LangChain pipeline with GPT-4
- Document loading is OK
- Document splitting is OK
- Vectorstores and embedding are OK
- Retrieving information with print(doc.page_content[:500]) is OK
My issue:
- Answering questions does not work; I get no result
- I'm using the code below:
# Preparing for question answering
from langchain_openai import ChatOpenAI
# Import the QA chain
from langchain.chains import RetrievalQA

llm2 = ChatOpenAI(model_name="gpt-4", temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm2,
    retriever=vectordb.as_retriever()
)
question = "my question?"
# Test to verify that the information is correctly retrieved
docs = vectordb.max_marginal_relevance_search(question, k=1, fetch_k=3)
print("Number of documents found:", len(docs))
for doc in docs:
    # Test printing of the first characters - working
    print("Printing the first characters:", doc.page_content[:500])
# Not working
result = qa_chain.invoke({"query": question})
result["result"]
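One detail I am not sure about: I run this as a plain Python script, not a notebook, and a bare result["result"] on the last line prints nothing by itself. Here is a minimal diagnostic sketch (return_source_documents is a documented RetrievalQA option; qa_chain_debug is just a name I made up for the test) that prints the answer explicitly and surfaces any hidden exception:

# Diagnostic sketch: print the answer explicitly and catch any exception
qa_chain_debug = RetrievalQA.from_chain_type(
    llm2,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,  # also return the chunks used for the answer
)
try:
    result = qa_chain_debug.invoke({"query": question})
    print("Answer:", result["result"])
    print("Number of source documents:", len(result["source_documents"]))
except Exception as e:
    print("QA chain failed with:", repr(e))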
Could you please help me? I have tried a lot of things, but I still have the problem.
I am posting the full code below:
# Environment imports
import os
import openai
import shutil
import sys
sys.path.append('../..')
# Load .env
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())  # read local .env file
# API config
openai.api_key = os.environ['OPENAI_API_KEY']
# Import the PDF loader module
from langchain_community.document_loaders import PyPDFLoader
# Load PDFs
loaders = [
    PyPDFLoader("C:/Users/XXX/OneDrive/Bureau/GPT_socle/test.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
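# Sanity check (my understanding: PyPDFLoader returns one Document per PDF page)
print("Number of pages loaded:", len(docs))
print("Metadata of the first page:", docs[0].metadata)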
# Import the token splitter to create chunks
from langchain.text_splitter import TokenTextSplitter
# Split the docs by token
# chunk_overlap = number of tokens carried over from the previous chunk so context is not lost
# chunk_size = number of tokens per chunk
token_splitter = TokenTextSplitter(
    chunk_size=500,
    chunk_overlap=150
)
splits = token_splitter.split_documents(docs)
print("Number of chunks with TokenTextSplitter:", len(splits))
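# Small sanity check of chunk_size / chunk_overlap on a dummy string
# (illustrative values only, not the ones used above)
demo_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=3)
demo_chunks = demo_splitter.split_text("word " * 50)
print("Demo chunks:", len(demo_chunks))
print("First demo chunk:", demo_chunks[0])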
# Use OpenAIEmbeddings to embed the chunks in an optimized way for the vector store
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)
# Import the Chroma vector store
from langchain_community.vectorstores import Chroma
# Create a persistent folder to store the vectors
persist_directory = 'docs/chroma/'
# If the persistent folder already exists, delete it
dossier_a_supprimer = './docs/chroma'
if os.path.exists(dossier_a_supprimer):
    shutil.rmtree(dossier_a_supprimer)
# Build the Chroma vector store: embed the splits with the OpenAI embeddings and persist them in docs/chroma
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)
# Check the number of vectors in the collection, which must equal the number of chunks
print("Collection count:", vectordb._collection.count())
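# Sketch: in a later session the persisted store could be reloaded instead of rebuilt
# (my assumption: the docs/chroma folder created above still exists)
vectordb_reloaded = Chroma(persist_directory=persist_directory, embedding_function=embedding)
print("Reloaded collection count:", vectordb_reloaded._collection.count())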
question = "What is the language of the document?"
# Use MMR to get the requested information plus other, more diverse results that could be important.
# We replace the first docs variable (the raw PDF pages) with results from the vector store of chunks.
docs = vectordb.max_marginal_relevance_search(question, k=1, fetch_k=3)
# The number of documents returned must equal k, the number of most relevant documents we want back; this confirms that MMR works
print("Number of documents returned (corresponds to k):", len(docs))
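# For comparison, a plain similarity search on the same question;
# MMR should return more diverse chunks than pure similarity
sim_docs = vectordb.similarity_search(question, k=1)
print("Similarity result:", sim_docs[0].page_content[:100])
print("MMR result:", docs[0].page_content[:100])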
# Persist the vector store containing the chunks
vectordb.persist()
# Use the self-query retriever to correctly retrieve metadata
from langchain_openai import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of `C:/Users/XXX/OneDrive/Bureau/GPT_socle/Test.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]
# Description of what is in the docs
document_content_description = "AI and Cybersecurity"
# gpt-4 is a chat model, so ChatOpenAI is used here instead of the completion-style OpenAI class
llm = ChatOpenAI(model_name="gpt-4", temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)
# Print the metadata of the retrieved documents
for d in docs:
    print(d.metadata)
# Print the page content for testing
# print("Page content after SelfQuery:", docs[0].page_content[:100])
# Not working
# docs = retriever.get_relevant_documents(question)
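# Sketch to surface the self-query error instead of leaving the call commented out
# (my assumption: the exception message will tell me why it fails)
try:
    sq_docs = retriever.get_relevant_documents(question)
    print("Self-query returned", len(sq_docs), "documents")
except Exception as e:
    print("SelfQueryRetriever failed with:", repr(e))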
# Preparing for question answering
from langchain_openai import ChatOpenAI
# Import the QA chain
from langchain.chains import RetrievalQA

llm2 = ChatOpenAI(model_name="gpt-4", temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm2,
    retriever=vectordb.as_retriever()
)
question = "my question?"
# Test to verify that the information is correctly retrieved
docs = vectordb.max_marginal_relevance_search(question, k=1, fetch_k=3)
print("Number of documents found:", len(docs))
for doc in docs:
    # Test printing of the first characters - working
    print("Printing the first characters:", doc.page_content[:500])
# Not working
result = qa_chain.invoke({"query": question})
result["result"]
-------------------------
Pip freeze:
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.2.0
asgiref==3.7.2
attrs==23.2.0
backoff==2.2.1
bcrypt==4.1.2
build==1.0.3
cachetools==5.3.2
certifi==2023.11.17
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.4.22
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
dataclasses-json==0.6.3
Deprecated==1.2.14
distro==1.9.0
fastapi==0.108.0
filelock==3.13.1
flatbuffers==23.5.26
frozenlist==1.4.1
fsspec==2023.12.2
google-auth==2.26.1
googleapis-common-protos==1.62.0
greenlet==3.0.3
grpcio==1.60.0
h11==0.14.0
httpcore==1.0.2
httptools==0.6.1
httpx==0.26.0
huggingface-hub==0.20.2
humanfriendly==10.0
idna==3.6
importlib-metadata==6.11.0
importlib-resources==6.1.1
jsonpatch==1.33
jsonpointer==2.4
kubernetes==29.0.0
langchain==0.1.0
langchain-community==0.0.11
langchain-core==0.1.9
langchain-openai==0.0.2
langsmith==0.0.79
lark==1.1.9
marshmallow==3.20.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.0.4
mypy-extensions==1.0.0
numpy==1.26.3
oauthlib==3.2.2
onnxruntime==1.16.3
openai==1.7.0
opentelemetry-api==1.22.0
opentelemetry-exporter-otlp-proto-common==1.22.0
opentelemetry-exporter-otlp-proto-grpc==1.22.0
opentelemetry-instrumentation==0.43b0
opentelemetry-instrumentation-asgi==0.43b0
opentelemetry-instrumentation-fastapi==0.43b0
opentelemetry-proto==1.22.0
opentelemetry-sdk==1.22.0
opentelemetry-semantic-conventions==0.43b0
opentelemetry-util-http==0.43b0
overrides==7.4.0
packaging==23.2
posthog==3.3.1
protobuf==4.25.1
pulsar-client==3.4.0
pyasn1==0.5.1
pyasn1-modules==0.3.0
pydantic==2.5.3
pydantic_core==2.14.6
pypdf==3.17.4
PyPika==0.48.9
pyproject_hooks==1.0.0
pyreadline3==3.4.1
python-dateutil==2.8.2
python-dotenv==1.0.0
PyYAML==6.0.1
regex==2023.12.25
requests==2.31.0
requests-oauthlib==1.3.1
rsa==4.9
six==1.16.0
sniffio==1.3.0
SQLAlchemy==2.0.25
starlette==0.32.0.post1
sympy==1.12
tenacity==8.2.3
tiktoken==0.5.2
tokenizers==0.15.0
tqdm==4.66.1
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.9.0
urllib3==2.1.0
uvicorn==0.25.0
watchfiles==0.21.0
websocket-client==1.7.0
websockets==12.0
wrapt==1.16.0
yarl==1.9.4
zipp==3.17.0
Thank you very much in advance.
Damien