Gen AI Training in Hyderabad Kukatpally

RAG

 

Here’s the corrected and working version of your snippet to read a PDF, extract text, and split it into chunks using LangChain’s RecursiveCharacterTextSplitter:

######################################

from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Load the PDF file.
pdf = PdfReader('./pdf-docs/oci-ai-foundations.pdf')

# Extract text from all pages.
# NOTE: extract_text() can return None for pages with no extractable
# text (e.g. scanned images), so guard before concatenating.
text = ""
for page in pdf.pages:
    page_text = page.extract_text()
    if page_text:
        text += page_text

# Split the text into smaller, overlapping chunks that are easier
# for embedding / question-answering models to process later.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,     # max characters per chunk (adjust as needed)
    chunk_overlap=100,   # characters shared between consecutive chunks
    separators=["\n\n", "\n", " ", ""]
)

# Split the extracted text using the splitter.
chunks = text_splitter.split_text(text)

# Convert each text chunk into a LangChain Document object.
# Each Document stores the text (page_content) plus metadata
# (source file and the chunk's position within the document).
docs = []
for i, chunk in enumerate(chunks):
    docs.append(Document(
        page_content=chunk,
        metadata={"source": "oci-ai-foundations.pdf", "chunk_index": i}
    ))

 

######################################

 

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import OracleVectorStore

def store_to_oracle_vector(
    chunks: list[str],
    source_file: str,
    embedding_model: Embeddings,
    connection_string: str,
    collection_name: str = "oci_ai_docs"
):
    """
    Convert text chunks into Documents, embed them, and store them in
    Oracle 23 AI Vector Store.

    Parameters
    ----------
    chunks : list[str]
        The list of text chunks extracted from your PDF or other sources.
    source_file : str
        Name or path of the source document (used in metadata).
    embedding_model : Embeddings
        Any LangChain-compatible embedding model
        (e.g., OpenAIEmbeddings, HuggingFaceEmbeddings).
    connection_string : str
        Oracle Database connection string in the format:
        oracle+oracledb://user:password@hostname:port/service_name
    collection_name : str, optional
        Logical name of the vector collection in Oracle
        (default "oci_ai_docs").

    Returns
    -------
    OracleVectorStore
        The initialized and populated Oracle Vector Store object.
    """
    # Step 1: wrap every chunk in a Document carrying provenance metadata.
    docs = []
    for i, chunk in enumerate(chunks):
        doc = Document(
            page_content=chunk,             # the text content of the chunk
            metadata={
                "source": source_file,      # original file name or path
                "chunk_index": i,           # order of the chunk in the document
                "doc_type": "pdf"           # type of file (adjust as needed)
            }
        )
        docs.append(doc)

    # Step 2: embed the documents and store them in Oracle.
    # NOTE(review): recent langchain_community releases expose this class
    # as OracleVS with a different from_documents signature
    # (client / table_name) — confirm against the installed version.
    vector_store = OracleVectorStore.from_documents(
        documents=docs,
        embedding=embedding_model,
        collection_name=collection_name,
        connection_string=connection_string
    )

    print(f"✅ Stored {len(docs)} chunks from '{source_file}' into Oracle Vector Store collection '{collection_name}'.")
    return vector_store

USAGE

from langchain_openai import OpenAIEmbeddings

# Example: create an embedding model and store the chunks produced earlier.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Replace with your real Oracle credentials / host / service name.
connection = "oracle+oracledb://admin:MyPass@myhost:1521/myservice"

# Capture the returned vector store so it can be queried later.
vector_store = store_to_oracle_vector(
    chunks=chunks,
    source_file="oci-ai-foundations.pdf",
    embedding_model=embedding_model,
    connection_string=connection,
    collection_name="ai_docs_demo"
)

# Optional: print the first chunk to verify.
# NOTE(review): the `docs` list is built inside store_to_oracle_vector and
# is not in scope here; the first chunk's text is identical, so print that.
print(chunks[0])

 

🧩 Explanation

  • PdfReader reads your PDF file.

  • extract_text() pulls out all the text from each page.

  • RecursiveCharacterTextSplitter divides the big text into smaller chunks that are easier for AI models to process later.

  • Document objects store each chunk, which you can later feed into embedding or question-answering models.

LANGCHAIN Basics

 

Here is a full LangChain basics demo, but this time using Oracle's OCIChatGenAI instead of OpenAI's ChatOpenAI.

This shows you how to build:
✅ A simple text invocation
✅ A PromptTemplate with variables
✅ A ChatPromptTemplate for multi-message prompts
✅ A conversation memory chain (ConversationChain)
✅ All powered by OCI Generative AI (Chat)


🧠 Full Example — LangChain + OCIChatGenAI Basics

# ------------------------------------------------------
# LangChain with OCIChatGenAI — Prompts, Chains & Memory
# ------------------------------------------------------

from langchain_community.chat_models import OCIChatGenAI
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain, ConversationChain
from langchain.memory import ConversationBufferMemory


# ------------------------------------------------------
# 1️⃣ Initialize OCI Chat Model
# ------------------------------------------------------
# Ensure your OCI config and AI endpoint are set up properly in ~/.oci/config
# with profile, tenancy, region, and credentials.
# NOTE(review): recent langchain_community releases name this class
# ChatOCIGenAI and take a `model_id=` keyword — confirm against the
# installed version before running.
llm = OCIChatGenAI(
    model="cohere.command-r-plus",  # or "meta.llama-3-70b-instruct", etc.
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..xxxxxxxxxxxx"
)


# ------------------------------------------------------
# 2️⃣ Simple Text Prompt — invoke the model with a plain string
# ------------------------------------------------------
text = "Explain what Retrieval-Augmented Generation (RAG) means in simple terms."
response = llm.invoke(text)
print("\n🧩 Simple Prompt Response:")
print(response.content)


# ------------------------------------------------------
# 3️⃣ PromptTemplate with input variables
# ------------------------------------------------------
template = """You are a helpful AI assistant.
Explain the concept of {topic} in {style} terms."""

prompt = PromptTemplate.from_template(template)

formatted_prompt = prompt.format(topic="LangChain", style="simple")
print("\n📜 Formatted Prompt Text:")
print(formatted_prompt)

# Chain the prompt and the LLM together.
chain = LLMChain(llm=llm, prompt=prompt)

result = chain.invoke({"topic": "LangChain", "style": "simple"})
print("\n💬 Response from LLMChain:")
print(result["text"])


# ------------------------------------------------------
# 4️⃣ ChatPromptTemplate (multi-message style)
# ------------------------------------------------------
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert AI explaining Oracle Generative AI concepts."),
    ("human", "Explain {concept} in one paragraph.")
])

chat_chain = LLMChain(llm=llm, prompt=chat_prompt)

chat_response = chat_chain.invoke({"concept": "vector embeddings"})
print("\n🤖 ChatPromptTemplate Response:")
print(chat_response["text"])


# ------------------------------------------------------
# 5️⃣ Memory — ConversationBufferMemory keeps the full message history
# ------------------------------------------------------
memory = ConversationBufferMemory(return_messages=True)
conversation = ConversationChain(llm=llm, memory=memory, verbose=False)

print("\n🧠 Conversation with Memory:")
conversation.invoke({"input": "Hi, who are you?"})
conversation.invoke({"input": "Can you tell me about LangChain?"})
conversation.invoke({"input": "Summarize our conversation so far."})

# Inspect stored memory (loop body indentation restored).
print("\n🗂️ Memory Chat History:")
for msg in memory.chat_memory.messages:
    print(f"{msg.type.upper()}: {msg.content}")

 


⚙️ Notes

  • Replace compartment_id and service_endpoint with your actual OCI Generative AI setup.

  • The model name can be one of:

    • "cohere.command-r-plus"

    • "meta.llama-3-70b-instruct"

    • "mistral.mixtral-8x7b-instruct" (depending on region and availability).

  • This works best when your OCI SDK and credentials are configured under ~/.oci/config.


Would you like me to show the modern LCEL version (using the pipe syntax prompt | llm) next? It’s much more compact and ideal for RAG pipelines.

 

 

 

 

gen1.py

# Install the dependency first (shell command, not Python):
#   pip install transformers
from transformers import pipeline

# Build a GPT-2 text-generation pipeline.
generator = pipeline('text-generation', model='gpt2')

prompt = "In 1989 sachin tendulkar debuted in cricket"

# Default decoding.
result = generator(prompt, max_length=100, num_return_sequences=1)
print(result[0]['generated_text'])

# Try this and observe the difference in output: enabling sampling with
# temperature / top-k / top-p yields more varied continuations.
result = generator(prompt, max_length=100, num_return_sequences=1,
                   temperature=0.7, top_k=50, top_p=0.95, do_sample=True)
print(result[0]['generated_text'])
CHATBOT

# -*- coding: utf-8 -*-
"""chatbot1.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1_Rmvf679gaZxbXHlmcbqwzK68H-rZURP
"""

# Install the dependency first (shell command, not Python):
#   pip install transformers

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Small conversational model; weights are downloaded on first run.
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

chat_history_ids = None
print("start chat, type 'quit' to exit")

for step in range(5):
    user_input = input("User")
    if user_input.lower() == 'quit':
        break

    # Encode the new user turn, appending the end-of-sequence token.
    new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token,
                                     return_tensors='pt')

    # Append the new turn to the running conversation history (if any).
    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1)
    else:
        bot_input_ids = new_input_ids

    # Generate the bot reply conditioned on the whole history so far.
    chat_history_ids = model.generate(bot_input_ids, max_length=1000,
                                      pad_token_id=tokenizer.eos_token_id)

    # Decode only the newly generated tokens (skip the input prefix).
    response = tokenizer.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0],
        skip_special_tokens=True
    )
    print(response)

See the "LLM chatbot in 30 minutes" video (from 2:45 to 3:15) for building a chatbot using LangChain and Streamlit:

Streamlit for the UI — we can explore building a web UI with Streamlit.

LangChain for the chatbot logic.