Q&A module not working

Hello Everyone,

First of all, I want to thank DeepLearning.AI and LangChain for their incredible work. Thank you very much.

I have one goal: I’m building a chatbot that can retrieve information from PDFs.

My steps:

  • I followed the entire class named “LangChain Chat with Your Data”
  • I installed the environment:
      • Windows 11 version 23H2 (OS build 22631.3007)
      • Python v3.11.7
      • Pip freeze result at the end (list of installed modules)
  • I started developing my own LangChain pipeline with GPT-4
  • Document loading is OK
  • Document splitting is OK
  • Vectorstores and embeddings are OK
  • Retrieving information with print(doc.page_content[:500]) is OK

My issue:

  • Answering my questions does not work; I get no result
  • I’m using the code below:

#Preparing for Question Answering
from langchain_openai import ChatOpenAI
#Import QA chain 
from langchain.chains import RetrievalQA

llm2 = ChatOpenAI(model_name="gpt-4", temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm2,
    retriever=vectordb.as_retriever()
)

question = "my question?"

#Test to verify that the information is correctly retrieved
docs = vectordb.max_marginal_relevance_search(question, k=1, fetch_k=3)
print("Number of documents found :", len(docs))
for doc in docs:
    #Test printing of the first characters - Working
    print('Printing the first characters :', doc.page_content[:500])

#Not working - I get no output here
#(note: a bare expression prints nothing outside a notebook, so the answer is printed explicitly)
result = qa_chain.invoke({"query": question})
print('Answer :', result["result"])

Could you please help me? I have tried a lot of things, but I still have the problem.

I’m posting the full code below:

#Env import
import os
import openai
import shutil
import sys
sys.path.append('../..')

#Load .env
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

#API config
openai.api_key  = os.environ['OPENAI_API_KEY']

#Load import PDF module
from langchain_community.document_loaders import PyPDFLoader

#Load PDFs
loaders = [
            PyPDFLoader("C:/Users/XXX/OneDrive/Bureau/GPT_socle/test.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())
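
#(Note, my own addition: PyPDFLoader returns one Document per PDF page,
# with the page number stored in doc.metadata["page"])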

#Import token splitter to do chunks
from langchain.text_splitter import TokenTextSplitter

#Split the docs by token
#Chunk overlap = number of tokens taken from the previous chunk so as not to lose context
#Chunk size = number of tokens per chunk
token_splitter = TokenTextSplitter(
    chunk_size = 500,
    chunk_overlap = 150
)
splits = token_splitter.split_documents(docs)
print('Nb of chunks with TokenSplitter : ',len(splits))
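
#(Sanity check, my own addition, not from the course: preview the first chunk
# to confirm the token split looks right)
#print('First chunk preview :', splits[0].page_content[:200] if splits else 'NO CHUNKS')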

#Use OpenAIEmbeddings to embed the chunks efficiently into the vector store
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)

#Import the Chroma vector store
from langchain_community.vectorstores import Chroma

#Persistent folder to store the vector database
persist_directory = 'docs/chroma/'

#If the persist folder already exists, delete it to start from a clean state
if os.path.exists(persist_directory):
    shutil.rmtree(persist_directory)

#Build the Chroma vector store: embed the splits with OpenAI embeddings and persist them in docs/chroma
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

#Check that the number of embedded documents in the collection equals the number of chunks
print('Nb of documents in the collection : ', vectordb._collection.count())
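
#(Sanity check, my own addition, not from the course: a plain similarity search
# should already return readable text if the embeddings were stored correctly)
#sanity = vectordb.similarity_search("test", k=1)
#print('Similarity sanity check :', sanity[0].page_content[:100] if sanity else 'EMPTY')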

question = "What is the language of the document ?"

#Use MMR to obtain the requested information plus other, diverse results that could be important
#We overwrite the first `docs` variable (the raw PDF pages) with the chunks retrieved from the vector store
docs = vectordb.max_marginal_relevance_search(question,k=1, fetch_k=3)

#The number of documents returned must equal k, the number of most relevant chunks we asked for
print('Number of documents returned (should equal k) - confirms MMR is working : ', len(docs))

#Persist the vector store containing the chunks
vectordb.persist()
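
#(Note, my own addition: once persisted, the store can be reloaded later with
# Chroma(persist_directory=persist_directory, embedding_function=embedding))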

#Use SelfQueryRetriever to filter on document metadata
from langchain_openai import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of `C:/Users/XXX/OneDrive/Bureau/GPT_socle/Test.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

#Description of what's in the docs
document_content_description = "AI and Cybersecurity"
#gpt-4 is a chat model, so it needs ChatOpenAI (the completions-style OpenAI class fails with it)
llm = ChatOpenAI(model_name='gpt-4', temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)
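
#(Note, my understanding rather than something from the course: SelfQueryRetriever asks
# the LLM to rewrite the question into a structured query - a search string plus a
# metadata filter - which is why it needs the `lark` package from my pip freeze below)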

#Print the metadata of the chunks retrieved earlier
for d in docs:
    print(d.metadata)

#Print the page content for testing
#print('Page content after SelfQuery :', docs[0].page_content[:100])

#Not working
#docs = retriever.get_relevant_documents(question)

#Preparing for Question Answering
from langchain_openai import ChatOpenAI
#Import QA chain 
from langchain.chains import RetrievalQA

llm2 = ChatOpenAI(model_name="gpt-4", temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm2,
    retriever=vectordb.as_retriever()
)

question = "my question?"

#Test to verify that the information is correctly retrieved
docs = vectordb.max_marginal_relevance_search(question, k=1, fetch_k=3)
print("Number of documents found :", len(docs))
for doc in docs:
    #Test printing of the first characters - Working
    print('Printing the first characters :', doc.page_content[:500])

#Not working - I get no output here
#(note: a bare expression prints nothing outside a notebook, so the answer is printed explicitly)
result = qa_chain.invoke({"query": question})
print('Answer :', result["result"])
-------------------------

Pip freeze :

aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.2.0
asgiref==3.7.2
attrs==23.2.0
backoff==2.2.1
bcrypt==4.1.2
build==1.0.3
cachetools==5.3.2
certifi==2023.11.17
charset-normalizer==3.3.2
chroma-hnswlib==0.7.3
chromadb==0.4.22
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
dataclasses-json==0.6.3
Deprecated==1.2.14
distro==1.9.0
fastapi==0.108.0
filelock==3.13.1
flatbuffers==23.5.26
frozenlist==1.4.1
fsspec==2023.12.2
google-auth==2.26.1
googleapis-common-protos==1.62.0
greenlet==3.0.3
grpcio==1.60.0
h11==0.14.0
httpcore==1.0.2
httptools==0.6.1
httpx==0.26.0
huggingface-hub==0.20.2
humanfriendly==10.0
idna==3.6
importlib-metadata==6.11.0
importlib-resources==6.1.1
jsonpatch==1.33
jsonpointer==2.4
kubernetes==29.0.0
langchain==0.1.0
langchain-community==0.0.11
langchain-core==0.1.9
langchain-openai==0.0.2
langsmith==0.0.79
lark==1.1.9
marshmallow==3.20.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
multidict==6.0.4
mypy-extensions==1.0.0
numpy==1.26.3
oauthlib==3.2.2
onnxruntime==1.16.3
openai==1.7.0
opentelemetry-api==1.22.0
opentelemetry-exporter-otlp-proto-common==1.22.0
opentelemetry-exporter-otlp-proto-grpc==1.22.0
opentelemetry-instrumentation==0.43b0
opentelemetry-instrumentation-asgi==0.43b0
opentelemetry-instrumentation-fastapi==0.43b0
opentelemetry-proto==1.22.0
opentelemetry-sdk==1.22.0
opentelemetry-semantic-conventions==0.43b0
opentelemetry-util-http==0.43b0
overrides==7.4.0
packaging==23.2
posthog==3.3.1
protobuf==4.25.1
pulsar-client==3.4.0
pyasn1==0.5.1
pyasn1-modules==0.3.0
pydantic==2.5.3
pydantic_core==2.14.6
pypdf==3.17.4
PyPika==0.48.9
pyproject_hooks==1.0.0
pyreadline3==3.4.1
python-dateutil==2.8.2
python-dotenv==1.0.0
PyYAML==6.0.1
regex==2023.12.25
requests==2.31.0
requests-oauthlib==1.3.1
rsa==4.9
six==1.16.0
sniffio==1.3.0
SQLAlchemy==2.0.25
starlette==0.32.0.post1
sympy==1.12
tenacity==8.2.3
tiktoken==0.5.2
tokenizers==0.15.0
tqdm==4.66.1
typer==0.9.0
typing-inspect==0.9.0
typing_extensions==4.9.0
urllib3==2.1.0
uvicorn==0.25.0
watchfiles==0.21.0
websocket-client==1.7.0
websockets==12.0
wrapt==1.16.0
yarl==1.9.4
zipp==3.17.0

Thank you very much in advance.

Damien