Building Production RAG Systems
A deep dive into building Retrieval Augmented Generation systems that actually work in production.
• 6 min read
const{Fragment:e,jsx:n,jsxs:r}=arguments[0];function _createMdxContent(i){const t={code:"code",em:"em",h1:"h1",h2:"h2",h3:"h3",li:"li",ol:"ol",p:"p",pre:"pre",strong:"strong",...i.components};return r(e,{children:[n(t.h1,{children:"Building Production RAG Systems"}),"\n",n(t.p,{children:"Retrieval Augmented Generation (RAG) has become the de-facto standard for building LLM applications with custom data. In this post, I'll share my experience building and scaling RAG systems for enterprise use cases."}),"\n",n(t.h2,{children:"The Challenge"}),"\n",r(t.p,{children:["Most RAG tutorials stop at ",n(t.code,{children:"pip install langchain"}),". But production is a different beast. unique challenges include:"]}),"\n",r(t.ol,{children:["\n",r(t.li,{children:[n(t.strong,{children:"Retriever Quality"}),": Vector search isn't magic. It often fails to capture semantic meaning effectively for domain-specific queries."]}),"\n",r(t.li,{children:[n(t.strong,{children:"Context Window Limits"}),": Even with 128k context windows, stuffing irrelevant documents hurts performance (Lost in the Middle phenomenon)."]}),"\n",r(t.li,{children:[n(t.strong,{children:"Latency"}),": Users expect sub-second responses. Chaining multiple LLM calls adds up."]}),"\n"]}),"\n",n(t.h2,{children:"Architecture"}),"\n",n(t.p,{children:"Here is the high-level architecture we settled on:"}),"\n",n(t.pre,{children:n(t.code,{className:"language-mermaid",children:"graph LR\n A[User Query] --\x3e B[Query Expansion]\n B --\x3e C[Hybrid Search]\n C --\x3e D[Reranking]\n D --\x3e E[LLM Context Stuffing]\n E --\x3e F[Generation]\n"})}),"\n",n(t.h3,{children:"Hybrid Search"}),"\n",n(t.p,{children:"We found that relying solely on vector search (Dense Retrieval) wasn't enough. 
We combined it with BM25 (Sparse Retrieval) to capture keyword matches."}),"\n",n(t.pre,{children:n(t.code,{className:"language-python",children:'from langchain.retrievers import EnsembleRetriever\nfrom langchain_community.retrievers import BM25Retriever\nfrom langchain_community.vectorstores import FAISS\n\n# Initialize retrievers\nbm25_retriever = BM25Retriever.from_texts(doc_list)\nfaiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})\n\n# Initialize ensemble retriever\nensemble_retriever = EnsembleRetriever(\n retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]\n)\n'})}),"\n",n(t.h2,{children:"Reranking"}),"\n",r(t.p,{children:["This was the biggest game changer. Using a Cross-Encoder like ",n(t.code,{children:"bge-reranker-v2-m3"})," re-ranks the retrieved documents based on their actual relevance to the query."]}),"\n",n(t.h2,{children:"Conclusion"}),"\n",r(t.p,{children:["Building RAG is easy. Building ",n(t.em,{children:"good"})," RAG is hard. Start simple, measure everything using a framework like Ragas, and iterate."]})]})}return{default:function(e={}){const{wrapper:r}=e.components||{};return r?n(r,{...e,children:n(_createMdxContent,{...e})}):_createMdxContent(e)}};
AI · RAG · LLM · Python
Subscribe to Newsletter
Get the latest updates on AI, Web3, and Tech.