See also the LangChain cookbook.
Below is an example of creating a persistent Chroma vector database from a simple text string, splitting it with a RecursiveCharacterTextSplitter and embedding the chunks with OpenAI embeddings.
"""Build a persistent Chroma vector store from a plain text string.

Splits one sample sentence into very small chunks, embeds each chunk with
OpenAI's text-embedding-ada-002 model, and persists the store to ./dbtest.
"""
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings

# Pull OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()

# https://platform.openai.com/docs/guides/embeddings/use-cases
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Deliberately tiny chunks so the splitting behaviour is easy to observe.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10,   # default: 4000
    chunk_overlap=2  # default: 200
)

texts = text_splitter.create_documents(
    texts=['Often times your document is too long (like a book) for your LLM. You need to split it up into chunks. Text splitters help with this.'])

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./dbtest")
You can also include PDF documents in your Chroma database. See the code below.
"""Build a persistent Chroma vector store from all PDFs in the current directory.

Loads every *.pdf with PyPDFLoader, splits the pages into ~2000-character
chunks, embeds them with OpenAI's text-embedding-ada-002 model, and persists
the resulting database to ./dbtest.
"""
import os
import sys

from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings

# Pull OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()

# Fail early, with a diagnostic, if there is nothing to index.
# (The original used the interactive-only `exit(1)` builtin and exited
# silently; sys.exit with a message still terminates with status 1 but
# also explains why.)
if not any(file.endswith('.pdf') for file in os.listdir('.')):
    sys.exit("No .pdf files found in the current directory; nothing to index.")

# https://platform.openai.com/docs/guides/embeddings/use-cases
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,   # default: 4000
    chunk_overlap=100  # default: 200
)

loader = DirectoryLoader(".", glob='./*.pdf', loader_cls=PyPDFLoader)
documents = loader.load()
texts = text_splitter.split_documents(documents)

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./dbtest")