RAG
Here’s the corrected and working version of your snippet to read a PDF, extract text, and split it into chunks using LangChain’s RecursiveCharacterTextSplitter:
######################################
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
# Load the PDF file
pdf = PdfReader('./pdf-docs/oci-ai-foundations.pdf')

# Extract text from all pages.
# NOTE: extract_text() may return None/"" for pages with no extractable
# text (e.g. scanned images), so skip those; "".join avoids the quadratic
# cost of repeated string concatenation in a loop.
page_texts = []
for page in pdf.pages:
    page_text = page.extract_text()
    if page_text:
        page_texts.append(page_text)
text = "".join(page_texts)

# Split the text into smaller, overlapping chunks that are easier for
# embedding / question-answering models to process later.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # max characters per chunk (adjust as needed)
    chunk_overlap=100,    # overlap between consecutive chunks
    separators=["\n\n", "\n", " ", ""]
)
chunks = text_splitter.split_text(text)

# Convert each text chunk into a LangChain Document object.
# Each Document stores the text (page_content) and metadata
# (source file and the chunk's position in the document).
docs = [
    Document(
        page_content=chunk,
        metadata={"source": "oci-ai-foundations.pdf", "chunk_index": i},
    )
    for i, chunk in enumerate(chunks)
]
######################################
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import OracleVectorStore
def store_to_oracle_vector(
    chunks: list[str],
    source_file: str,
    embedding_model: Embeddings,
    connection_string: str,
    collection_name: str = "oci_ai_docs"
):
    """
    Embed text chunks and persist them in an Oracle 23 AI Vector Store.

    Parameters
    ----------
    chunks : list[str]
        Text chunks extracted from a PDF or other source.
    source_file : str
        Name or path of the originating document (stored in metadata).
    embedding_model : Embeddings
        Any LangChain-compatible embedding model
        (e.g., OpenAIEmbeddings, HuggingFaceEmbeddings).
    connection_string : str
        Oracle Database connection string in the format:
        oracle+oracledb://user:password@hostname:port/service_name
    collection_name : str, optional
        Logical name of the vector collection in Oracle (default "oci_ai_docs").

    Returns
    -------
    OracleVectorStore
        The initialized and populated Oracle Vector Store object.
    """
    # Step 1: wrap every chunk in a Document carrying provenance metadata —
    # where it came from, its position in the source, and the file type.
    docs = [
        Document(
            page_content=chunk_text,
            metadata={
                "source": source_file,      # original file name or path
                "chunk_index": idx,         # order of the chunk in the document
                "doc_type": "pdf",          # type of file (adjust as needed)
            },
        )
        for idx, chunk_text in enumerate(chunks)
    ]

    # Step 2: embed the documents and write them into the Oracle
    # vector collection in a single call.
    vector_store = OracleVectorStore.from_documents(
        documents=docs,
        embedding=embedding_model,
        collection_name=collection_name,
        connection_string=connection_string
    )

    print(f"✅ Stored {len(docs)} chunks from '{source_file}' into Oracle Vector Store collection '{collection_name}'.")
    return vector_store
USAGE
from langchain_openai import OpenAIEmbeddings

# Example: build an embedding model and push the chunks into Oracle.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
connection = "oracle+oracledb://admin:MyPass@myhost:1521/myservice"

# Collect the call arguments in one place, then invoke the helper.
store_kwargs = {
    "chunks": chunks,
    "source_file": "oci-ai-foundations.pdf",
    "embedding_model": embedding_model,
    "connection_string": connection,
    "collection_name": "ai_docs_demo",
}
store_to_oracle_vector(**store_kwargs)

# Optional sanity check: show the first chunk that was processed.
print(docs[0].page_content)
🧩 Explanation
- `PdfReader` reads your PDF file.
- `extract_text()` pulls out all the text from each page.
- `RecursiveCharacterTextSplitter` divides the big text into smaller chunks that are easier for AI models to process later.
- `Document` objects store each chunk, which you can later feed into embedding or question-answering models.
LANGCHAIN Basics
full LangChain basics demo, but this time using Oracle’s OCIChatGenAI instead of OpenAI’s ChatOpenAI.
This shows you how to build:
✅ A simple text invocation
✅ A PromptTemplate with variables
✅ A ChatPromptTemplate for multi-message prompts
✅ A conversation memory chain (ConversationChain)
✅ All powered by OCI Generative AI (Chat)
🧠 Full Example — LangChain + OCIChatGenAI Basics
# ------------------------------------------------------
# LangChain with OCI Generative AI — Prompts, Chains & Memory
# ------------------------------------------------------
# FIX: the langchain_community class is ChatOCIGenAI (not OCIChatGenAI),
# and the model is selected with the `model_id` keyword, per the
# langchain_community OCI Generative AI documentation.
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain, ConversationChain
from langchain.memory import ConversationBufferMemory

# ------------------------------------------------------
# 1️⃣ Initialize OCI Chat Model
# ------------------------------------------------------
# Ensure your OCI config and AI endpoint are set up properly in ~/.oci/config
# with profile, tenancy, region, and credentials.
llm = ChatOCIGenAI(
    model_id="cohere.command-r-plus",  # or "meta.llama-3-70b-instruct", etc.
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..xxxxxxxxxxxx"
)

# ------------------------------------------------------
# 2️⃣ Simple Text Prompt
# ------------------------------------------------------
text = "Explain what Retrieval-Augmented Generation (RAG) means in simple terms."
response = llm.invoke(text)
print("\n🧩 Simple Prompt Response:")
print(response.content)

# ------------------------------------------------------
# 3️⃣ PromptTemplate with input variables
# ------------------------------------------------------
template = """You are a helpful AI assistant.
Explain the concept of {topic} in {style} terms."""
prompt = PromptTemplate.from_template(template)

formatted_prompt = prompt.format(topic="LangChain", style="simple")
print("\n📜 Formatted Prompt Text:")
print(formatted_prompt)

# Chain prompt and LLM
chain = LLMChain(llm=llm, prompt=prompt)
result = chain.invoke({"topic": "LangChain", "style": "simple"})
print("\n💬 Response from LLMChain:")
print(result["text"])

# ------------------------------------------------------
# 4️⃣ ChatPromptTemplate (multi-message style)
# ------------------------------------------------------
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert AI explaining Oracle Generative AI concepts."),
    ("human", "Explain {concept} in one paragraph.")
])
chat_chain = LLMChain(llm=llm, prompt=chat_prompt)
chat_response = chat_chain.invoke({"concept": "vector embeddings"})
print("\n🤖 ChatPromptTemplate Response:")
print(chat_response["text"])

# ------------------------------------------------------
# 5️⃣ Memory — ConversationBufferMemory
# ------------------------------------------------------
# The buffer memory records every human/AI message so the
# ConversationChain can feed the full history back into the model.
memory = ConversationBufferMemory(return_messages=True)
conversation = ConversationChain(llm=llm, memory=memory, verbose=False)

print("\n🧠 Conversation with Memory:")
conversation.invoke({"input": "Hi, who are you?"})
conversation.invoke({"input": "Can you tell me about LangChain?"})
conversation.invoke({"input": "Summarize our conversation so far."})

# Inspect stored memory
print("\n🗂️ Memory Chat History:")
for msg in memory.chat_memory.messages:
    print(f"{msg.type.upper()}: {msg.content}")
⚙️ Notes
- Replace `compartment_id` and `service_endpoint` with your actual OCI Generative AI setup.
- The model name can be one of:
  - `"cohere.command-r-plus"`
  - `"meta.llama-3-70b-instruct"`
  - `"mistral.mixtral-8x7b-instruct"` (depending on region and availability).
- This works best when your OCI SDK and credentials are configured under `~/.oci/config`.
Would you like me to show the modern LCEL version (using the pipe syntax prompt | llm) next? It’s much more compact and ideal for RAG pipelines.
gen1.py
# -*- coding: utf-8 -*-
"""chatbot1.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1_Rmvf679gaZxbXHlmcbqwzK68H-rZURP
"""
# FIX: a bare `pip install` line is a SyntaxError in a .py file.
# Install the dependency from a shell (or use `!pip install` in Colab):
#   pip install transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Small conversational model; weights and tokenizer are downloaded on
# first use and cached locally.
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

chat_history_ids = None  # token history of the whole conversation so far
print("Start chat. Type 'quit' to exit.")

for step in range(5):  # cap the demo at 5 turns
    user_input = input("User")
    if user_input.lower() == 'quit':
        break

    # Encode the new user message, terminated by the EOS token.
    new_input_ids = tokenizer.encode(
        user_input + tokenizer.eos_token, return_tensors='pt'
    )

    # Append to prior history so the model sees the full conversation.
    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1)
    else:
        bot_input_ids = new_input_ids

    chat_history_ids = model.generate(
        bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens (everything after the input).
    response = tokenizer.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0],
        skip_special_tokens=True
    )
    print(response)
LLM chat bot in 30 minutes — shown in this video from 2:45 to 3:15, using LangChain and Streamlit:
- Streamlit for the UI (we can explore a web UI using Streamlit further)
- LangChain for the chat bot logic
