Simple RAG
In [ ]:
Copied!
!git clone https://github.com/adithya-s-k/VARAG
%cd VARAG
%pwd
!git clone https://github.com/adithya-s-k/VARAG
%cd VARAG
%pwd
In [ ]:
Copied!
!apt-get update && apt-get install -y && apt-get install -y poppler-utils
!apt-get update && apt-get install -y && apt-get install -y poppler-utils
In [ ]:
Copied!
%pip install -e .
## We will be using Docling for OCR
%pip install docling
%pip install -e .
## We will be using Docling for OCR
%pip install docling
In [1]:
Copied!
# Standard library
import os

# Third-party
import lancedb
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

# VARAG
from varag.chunking import FixedTokenChunker
from varag.llms import LiteLLM, OpenAI
from varag.rag import SimpleRAG

# Load environment variables (e.g. OPENAI_API_KEY) from a .env file if one is
# found. Alternative for quick experiments — never commit a real key:
# os.environ["OPENAI_API_KEY"] = "api-key"
load_dotenv()
/home/adithya/miniconda3/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console) from tqdm.autonotebook import tqdm, trange
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[1], line 2 1 from sentence_transformers import SentenceTransformer ----> 2 from varag.rag import SimpleRAG 3 from varag.llms import OpenAI 4 from varag.llms import LiteLLM File ~/workspace/VARAG/varag/rag/__init__.py:1 ----> 1 from ._simpleRAG import SimpleRAG 2 from ._colpaliRAG import ColpaliRAG 3 from ._hybridColpaliRAG import HybridColpaliRAG File ~/workspace/VARAG/varag/rag/_simpleRAG.py:16 14 from openai import OpenAI 15 from dotenv import load_dotenv ---> 16 from varag.chunking import BaseChunker, FixedTokenChunker 17 from sklearn.metrics import precision_score, recall_score, f1_score 18 import pandas as pd ModuleNotFoundError: No module named 'varag.chunking'
In [ ]:
Copied!
# Read the API key from the environment (populated by load_dotenv() in the
# imports cell, or by the shell). os.getenv returns None when it is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
In [ ]:
Copied!
# Initialize the LLM through VARAG's LiteLLM wrapper.
# The original requested "gpt-3.5-turbo" together with is_vision_required=True;
# that model is text-only, so the two settings contradict each other. Use
# "gpt-4o-mini", the same model the RAG setup cell further down configures.
llm = LiteLLM(model="gpt-4o-mini", is_vision_required=True, api_key=OPENAI_API_KEY)
In [ ]:
Copied!
# Smoke test: send one trivial prompt to confirm the LLM client and API key
# work before building the retrieval pipeline.
response = llm.query(query="What is your name?")
print(response)
In [ ]:
Copied!
# Embedding model handed to SimpleRAG as its `text_embedding_model`.
# NOTE(review): trust_remote_code=True lets code shipped in the model repo run
# locally; it is probably unnecessary for these checkpoints — confirm and drop.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", trust_remote_code=True)
# Alternative checkpoints:
# embedding_model = SentenceTransformer("BAAI/bge-base-en", trust_remote_code=True)
# embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5", trust_remote_code=True)
# embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5", trust_remote_code=True)

# Initialize shared database
shared_db = lancedb.connect("~/shared_rag_db")

# Initialize TextRAG with shared database
text_rag = SimpleRAG(
    text_embedding_model=embedding_model,
    db=shared_db,
    table_name="textDemo",
)

# Re-read the key so this cell also works when run on its own.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize OpenAI LLM
llm = LiteLLM(model="gpt-4o-mini", is_vision_required=True, api_key=OPENAI_API_KEY)
In [ ]:
Copied!
# Index the files directly under ./examples/data (no recursion into
# sub-folders), replacing any existing table contents (overwrite=True).
# OCR is enabled; chunking uses FixedTokenChunker with chunk_size=1000.
text_rag.index(
    "./examples/data",
    recursive=False,
    chunking_strategy=FixedTokenChunker(chunk_size=1000),
    metadata={"source": "gradio_upload"},
    overwrite=True,
    verbose=True,
    ocr=True,
)
In [ ]:
Copied!
# Retrieve the top-k chunks for the query and print each one for inspection.
query = "what is colpali ?"
num_results = 5

search_results = text_rag.search(query, k=num_results)

print("This was the retrieved Context")

# The loop body had lost its indentation in the export (an IndentationError if
# run as-is); it is restored here. The separator is built once, not per print.
separator = "===" * 50
for position, hit in enumerate(search_results, start=1):
    print(separator)
    print(f"\n\nChunk {position}:")
    print(f"Text: {hit['text']}")
    print(f"Chunk Index: {hit['chunk_index']}")
    print(f"Document Name: {hit['document_name']}")
    print(f"\n\n{separator}")
In [ ]:
Copied!
from IPython.display import display, Markdown, Latex

# Join the text of every retrieved chunk into one newline-separated context
# string, then ask the LLM to answer the same query against that context.
context = "\n".join(hit["text"] for hit in search_results)

response = llm.query(
    context=context,
    system_prompt="Given the below information answer the questions",
    query=query,
)

# Render the answer as Markdown instead of printing the raw string.
display(Markdown(response))
Run Gradio Demo
In [ ]:
Copied!
%cd examples
!python textDemo.py --share
%cd examples
!python textDemo.py --share