# Gen AI
# title: GEN AI
# categories: [GEN AI]
# tags: GEN AI, PDF SUMMARY
# Import Libraries
#!pip install langchain langchain-community transformers sentence-transformers faiss-cpu pypdf
# One import statement per line; several `from ... import ...` statements on
# a single line are not valid Python.
# NOTE(review): newer LangChain releases may expect these loaders, embeddings
# and vector stores to be imported from `langchain_community` (and the
# splitter from `langchain_text_splitters`) -- confirm against the installed
# version before changing the paths.
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)
# --- Load PDF ---
# Load "document.pdf" (resolved relative to the current working directory);
# the loaded documents are handed to the text splitter afterwards.
loader = PyPDFLoader("document.pdf")
docs = loader.load()
# --- Split Documents into Chunks ---
# Split the loaded documents using chunk_size=500 and chunk_overlap=50; the
# overlap lets text cut at a boundary appear in both neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)
# --- Create Embeddings and Vector Store ---
# Embed every chunk with the all-MiniLM-L6-v2 sentence-transformer, index the
# chunks in a FAISS store, and expose that store as a retriever for queries.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever()
# --- Load the Generation Model ---
# The tokenizer and the seq2seq model are loaded from the same checkpoint
# name and wrapped in a text2text-generation pipeline used for answering.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
flan_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
)
def query_rag(question):
    """Answer *question* using text retrieved from the indexed document.

    The module-level ``retriever`` is queried with the question, the page
    content of the returned documents is joined into one context string,
    and a prompt built from that context is passed to the module-level
    ``flan_pipeline``.

    Args:
        question: The question (or instruction) to answer, as a string.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    # `invoke` is the current retriever entry point; it replaces the
    # deprecated `get_relevant_documents` and is called with the same query.
    relevant_docs = retriever.invoke(question)
    context = "\n".join(doc.page_content for doc in relevant_docs)

    # Adjacent literals concatenate into exactly the original prompt text.
    prompt = (
        "Answer the question using only the context:\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        "Answer:"
    )

    response = flan_pipeline(
        prompt,
        max_new_tokens=200,
        # Creativity control (lower = deterministic, higher = more diverse)
        temperature=0.9,
        # Only sample from the top-k most likely tokens
        top_k=50,
        # Nucleus sampling: only sample from tokens with cumulative prob <= top_p
        top_p=0.9,
        # Enables sampling (required for temperature/top-k/top-p to work)
        do_sample=True,
    )
    return response[0]["generated_text"]
# Ask for a roughly 200-word summary of the indexed document and print it.
print(
    query_rag(
        "Summarize the key points of this document in a paragraph of 200 words."
    )
)