See also the LangChain cookbook.
Below is an example of creating a Chroma vector database using a TextSplitter with a simple text string.
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Pull the OpenAI API key (and any other settings) from a local .env file.
load_dotenv()

# Embedding model reference:
# https://platform.openai.com/docs/guides/embeddings/use-cases
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Deliberately tiny chunks so the splitting behaviour is easy to observe.
# Library defaults are chunk_size=4000, chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=2)

# Split one long string into overlapping Document chunks.
texts = text_splitter.create_documents(
    texts=['Often times your document is too long (like a book) for your LLM. You need to split it up into chunks. Text splitters help with this.'])

# Embed every chunk and persist the resulting vector store on disk.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./dbtest")
You can also include PDF documents in your Chroma database. See the code below.
import os
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings

# Pull the OpenAI API key (and any other settings) from a local .env file.
load_dotenv()

# Bail out early if there is nothing to index.
# FIX: the original `exit(1)` was not indented under the `if` (IndentationError),
# and `exit()` is the interactive-site helper, not meant for scripts.
# `raise SystemExit(<str>)` prints the message to stderr and exits with status 1,
# matching the original exit code while adding a diagnostic.
if not any(file.endswith('.pdf') for file in os.listdir('.')):
    raise SystemExit("No .pdf files found in the current directory.")

# https://platform.openai.com/docs/guides/embeddings/use-cases
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Larger chunks than the toy example; defaults are 4000 / 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,   # default: 4000
    chunk_overlap=100  # default: 200
)

# Load every PDF in the current directory, one Document per page.
loader = DirectoryLoader(".", glob='./*.pdf', loader_cls=PyPDFLoader)
documents = loader.load()

# Re-split the page Documents into embedding-sized chunks.
texts = text_splitter.split_documents(documents)

# Embed every chunk and persist the resulting vector store on disk.
vectordb = Chroma.from_documents(
    documents=texts, embedding=embeddings, persist_directory="./dbtest")
