Post

Gen ai

Gen ai

title: GEN AI
categories: [GEN AI]

tags: [GEN AI, PDF SUMMARY]


Import Libraries

#!pip install langchain langchain-community transformers sentence-transformers faiss-cpu pypdf

from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    pipeline,
)

### Load PDF

# Load the PDF into LangChain Document objects.
# NOTE(review): expects document.pdf in the current working directory.
loader = PyPDFLoader("document.pdf")
docs = loader.load()

### Split Documents into Chunks

# Split the loaded pages into 500-character chunks with 50 characters of
# overlap, so retrieved passages stay small enough for the model prompt
# while context isn't lost at chunk boundaries.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

### Create Embeddings and Vector Store

# Embed each chunk with a small sentence-transformer model and index the
# vectors in an in-memory FAISS store; the retriever wraps similarity search.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever()

# Load FLAN-T5 (an encoder-decoder model, hence AutoModelForSeq2SeqLM)
# and wrap it in a text2text-generation pipeline for prompting.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
flan_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def query_rag(question):
    """Answer *question* with retrieval-augmented generation over the PDF.

    Retrieves the chunks most similar to the question, packs them into a
    context-restricted prompt, and asks the FLAN-T5 pipeline for an answer.

    Parameters:
        question (str): natural-language question about the document.

    Returns:
        str: the model's generated answer text.
    """
    relevant_docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in relevant_docs)
    prompt = (
        f"Answer the question using only the context:\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
    )
    response = flan_pipeline(
        prompt,
        max_new_tokens=200,
        temperature=0.9,  # Creativity control (lower = deterministic, higher = more diverse)
        top_k=50,         # Only sample from the top-k most likely tokens
        top_p=0.9,        # Nucleus sampling: only sample from tokens with cumulative prob <= top_p
        do_sample=True,   # Enables sampling (required for temperature/top-k/top-p to work)
    )
    return response[0]["generated_text"]

# Demo: ask the RAG chain for a ~200-word summary of the indexed document.
print(query_rag("Summarize the key points of this document in a paragraph of 200 words."))

This post is licensed under CC BY 4.0 by the author.