Tutorials
Tutorials · Keiro API · RAG

Building RAG Pipelines with Keiro API: A Complete Guide

Learn how to build production-ready RAG (Retrieval Augmented Generation) pipelines using Keiro API. Step-by-step tutorial with code examples.

Building RAG Pipelines with Keiro API: A Complete Guide

What is RAG?

Retrieval Augmented Generation (RAG) is a technique that enhances LLM responses by providing relevant external context. Instead of relying solely on the model's training data, RAG retrieves real-time information to generate more accurate, up-to-date responses.

Why Use Keiro for RAG?

Traditional RAG pipelines require:

  1. Crawling web pages

  2. Extracting content

  3. Chunking text

  4. Embedding and storing vectors

  5. Similarity search

  6. Context injection

Keiro simplifies this by handling steps 1-4 automatically:

const API_URL = "https://kierolabs.space/api/search-pro";

const payload = {
    query: "future of ai agents",
    apiKey: "YOUR_API_KEY"
};

// Send the query as a JSON POST body.
const response = await fetch(API_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload)
});

// fetch() only rejects on network failure, so HTTP error statuses must be
// checked explicitly before trying to parse the body as results.
if (!response.ok) {
    throw new Error(`Keiro request failed: ${response.status} ${response.statusText}`);
}

const data = await response.json();

// Process results: print the remaining credits and the first three sources.
console.log(`Credits remaining: ${data.creditsRemaining}`);
const results = data.data?.extracted_content ?? [];
results.slice(0, 3).forEach(item => {
    console.log(`- ${item.title}`);
    console.log(`  ${item.url}`);
});

Basic RAG Implementation

Step 1: Set Up Your Environment

pip install "openai>=1.0" requests

Step 2: Create the Retrieval Function

import requests
import openai

# Placeholder credentials - replace with real keys before running.
OPENAI_API_KEY = "your-openai-key"
KEIRO_API_KEY = "your-keiro-key"

# Set the key on the openai module, which the snippets call directly
# through openai.chat.completions.create.
openai.api_key = OPENAI_API_KEY

# Keiro search endpoint that retrieve_context posts to.
API_URL = "https://kierolabs.space/api/search-pro"


def retrieve_context(query: str, num_results: int = 5) -> str:
    """Retrieve relevant context using the Keiro search API.

    Args:
        query: Search query sent to the Keiro endpoint.
        num_results: Maximum number of extracted results to include.

    Returns:
        One "Source / URL / Content" section per result, joined with
        newlines. Empty string when the response has no extracted content.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a 4xx/5xx HTTP status.
    """
    payload = {
        "query": query,
        "apiKey": KEIRO_API_KEY,
        "cache_search": True,
    }

    # requests has no default timeout; without one a stalled connection
    # blocks the caller indefinitely.
    response = requests.post(API_URL, json=payload, timeout=10)
    # Fail loudly on HTTP errors instead of treating an error body as results.
    response.raise_for_status()
    data = response.json()

    # `or` also covers explicit JSON nulls, not just missing keys.
    extracted = ((data.get("data") or {}).get("extracted_content") or [])[:num_results]

    context_parts = []
    for item in extracted:
        # Cap each source at 1000 characters to keep the prompt small.
        content = (item.get("content") or "")[:1000]
        context_parts.append(f"""
Source: {item.get("title")}
URL: {item.get("url")}
Content: {content}
---
        """)

    return "\n".join(context_parts)

Step 3: Build the RAG Function

def rag_query(user_question: str) -> str:
    """Answer a question with an LLM grounded in retrieved context.

    Args:
        user_question: The question to answer; also used as the search query.

    Returns:
        The text of the first completion choice.
    """
    # Retrieval step: fetch formatted context for the question.
    retrieved = retrieve_context(user_question)

    # Generation step: the context is injected into the system prompt so the
    # model is told to answer from it alone.
    system_prompt = (
        "You are a helpful assistant. Answer the user's question "
        "based ONLY on the provided context. "
        "If the context does not contain the answer, say "
        "'I could not find relevant information in the context.'\n\n"
        f"Context:\n{retrieved}"
    )
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_question},
    ]

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0.3,
    )
    return completion.choices[0].message.content

Step 4: Use It

# Run the full pipeline once: retrieve context, then generate the answer.
question = "What are the latest breakthroughs in fusion energy?"
answer = rag_query(question)
print(answer)

Advanced RAG Patterns

Multi-Source Research RAG

For complex questions, use Keiro's research endpoint:

import requests
import openai

# Keiro research endpoint that research_rag posts to.
API_URL = "https://kierolabs.space/api/research-pro"
# Placeholder - replace with your Keiro API key.
KEIRO_API_KEY = "your-keiro-key"

def research_rag(query: str) -> str:
    """Answer a question from Keiro research results, with numbered citations.

    Up to ten extracted results are numbered [1]..[n]; the model is told to
    answer only from them and to cite those numbers.

    Args:
        query: The research question; used both as the search query and as
            the user message.

    Returns:
        The text of the first completion choice.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a 4xx/5xx HTTP status from the research endpoint.
    """
    payload = {
        "query": query,
        "apiKey": KEIRO_API_KEY,
        "cache_search": True,
    }

    # requests has no default timeout. NOTE(review): 60s is a guess for a
    # research call - tune to the endpoint's real latency.
    response = requests.post(API_URL, json=payload, timeout=60)
    response.raise_for_status()
    data = response.json()

    # `or` also covers explicit JSON nulls, not just missing keys.
    extracted = ((data.get("data") or {}).get("extracted_content") or [])[:10]

    # Build context with citations: the same number labels a passage and
    # its source line so the model's [n] markers can be traced back.
    context_parts = []
    sources = []
    for number, item in enumerate(extracted, start=1):
        passage = (item.get("content") or "")[:800]
        context_parts.append(f"[{number}] {passage}")
        sources.append(f"[{number}] {item.get('title')} - {item.get('url')}")

    # Joined outside the f-string so no chr(10) workaround is needed.
    context_block = "\n".join(context_parts)
    sources_block = "\n".join(sources)
    context = f"""
Context:
{context_block}

Sources:
{sources_block}
"""

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "Answer ONLY using the provided context. "
                    "Include citation numbers like [1], [2] when referencing sources. "
                    "If unsure, say you don't know.\n\n"
                    f"{context}"
                ),
            },
            {"role": "user", "content": query},
        ],
        temperature=0.3,
    )

    return completion.choices[0].message.content

Hybrid Search RAG

Combine semantic search with keyword matching:

import requests
import openai

# Keiro search endpoint that search_api posts to.
API_URL = "https://kierolabs.space/api/search-pro"
# Placeholder - replace with your Keiro API key.
KEIRO_API_KEY = "your-keiro-key"

def extract_key_terms(query: str):
    """Return at most the first five whitespace-separated words of ``query``.

    A deliberately naive stand-in for real keyword extraction.
    """
    max_terms = 5
    words = query.split()
    return words[:max_terms]


def search_api(query: str):
    """Run one Keiro search and return its list of extracted results.

    Args:
        query: Search query sent to the Keiro endpoint.

    Returns:
        The list under ``data.extracted_content`` in the response, or an
        empty list when it is missing.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a 4xx/5xx HTTP status.
    """
    payload = {
        "query": query,
        "apiKey": KEIRO_API_KEY,
    }
    # requests has no default timeout; set one so a stalled call cannot hang.
    response = requests.post(API_URL, json=payload, timeout=10)
    response.raise_for_status()

    # The results are nested under data.extracted_content. Returning the
    # "data" dict itself breaks merge_results, which concatenates two lists
    # and calls .get("url") on each element.
    data = response.json().get("data") or {}
    return data.get("extracted_content") or []


def merge_results(semantic, keyword):
    """Combine two result lists, dropping duplicate and URL-less entries.

    Semantic results come first; for a repeated URL the first occurrence
    wins. At most eight results are returned to limit context size.
    """
    first_by_url = {}  # insertion-ordered: URL -> first result seen with it

    for candidate in semantic + keyword:
        link = candidate.get("url")
        if not link or link in first_by_url:
            continue
        first_by_url[link] = candidate

    return list(first_by_url.values())[:8]


def hybrid_rag(query: str) -> str:
    """Answer ``query`` from the merged results of two searches.

    One search uses the full query, the other only its extracted key terms;
    the merged, de-duplicated results become numbered context for the model.

    Returns:
        The text of the first completion choice.
    """
    # Two retrieval passes: the whole question, then just its key terms.
    full_query_hits = search_api(query)
    term_query_hits = search_api(" ".join(extract_key_terms(query)))

    merged_hits = merge_results(full_query_hits, term_query_hits)

    # One numbered section per result so the model can cite [1], [2], ...
    sections = [
        f"\n[{number}] {doc.get('title')}\n{doc.get('content', '')[:500]}\nSource: {doc.get('url')}\n"
        for number, doc in enumerate(merged_hits, start=1)
    ]
    context = "\n".join(sections)

    system_prompt = (
        "Answer ONLY using the provided context. "
        "Use citations like [1], [2]. If unsure, say you don't know.\n\n"
        f"{context}"
    )
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        temperature=0.3,
    )

    return completion.choices[0].message.content

Streaming RAG

For real-time user experience:

from openai import AsyncOpenAI

# Async OpenAI client used by streaming_rag; the key is a placeholder.
client = AsyncOpenAI(api_key="your-openai-key")

async def streaming_rag(query: str):
    """Stream an answer to ``query`` as it is generated.

    Async generator: retrieves context, opens a streaming chat completion,
    and yields each non-empty text fragment as it arrives.

    Args:
        query: The user's question; also used as the search query.

    Yields:
        Successive pieces of the answer text.
    """
    # Imported locally because this snippet's imports only bring in AsyncOpenAI.
    import asyncio

    # Step 1: Retrieve context. retrieve_context makes a blocking HTTP call,
    # so run it in a worker thread to keep the event loop responsive.
    loop = asyncio.get_running_loop()
    context = await loop.run_in_executor(None, retrieve_context, query)

    # Step 2: Create stream
    stream = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "Answer ONLY using the provided context. "
                    "If the answer is not in the context, say you don't know.\n\n"
                    f"Context:\n{context}"
                ),
            },
            {"role": "user", "content": query},
        ],
        stream=True,
        temperature=0.3,
    )

    # Step 3: Stream chunks safely
    async for chunk in stream:
        # A chunk may arrive with an empty choices list; indexing [0]
        # unguarded would raise IndexError mid-stream.
        if not chunk.choices:
            continue

        text = getattr(chunk.choices[0].delta, "content", None)
        if text:
            yield text

Production Best Practices

1. Caching

Leverage Keiro's 50% cache discount:

import requests
from functools import lru_cache

# Keiro search endpoint that cached_retrieve posts to.
API_URL = "https://kierolabs.space/api/search-pro"
# Placeholder - replace with your Keiro API key.
KEIRO_API_KEY = "your-keiro-key"


@lru_cache(maxsize=1000)
def cached_retrieve(query: str):
    """Fetch Keiro search results for ``query``, memoized in-process.

    Up to 1000 distinct queries are kept. A cache hit returns the same dict
    object as the first call, so callers should treat it as read-only.

    Args:
        query: Search query; also the cache key.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout,
            or a 4xx/5xx HTTP status. Failures are not cached.
    """
    payload = {
        "query": query,
        "apiKey": KEIRO_API_KEY,
        # Same flag the other snippets send to opt in to Keiro's cache.
        # NOTE(review): confirm against the Keiro docs that this is what
        # triggers the cache discount.
        "cache_search": True,
    }

    response = requests.post(API_URL, json=payload, timeout=10)
    # lru_cache memoizes return values only. Raising here keeps a transient
    # error body from being served for every later identical query.
    response.raise_for_status()
    return response.json()


def retrieve_with_cache(query: str):
    """Return the memoized Keiro response for ``query``.

    Thin wrapper that delegates to ``cached_retrieve``.
    """
    cached_response = cached_retrieve(query)
    return cached_response

2. Error Handling

import requests
import time

# Keiro search endpoint that robust_retrieve posts to.
API_URL = "https://kierolabs.space/api/search-pro"
# Placeholder - replace with your Keiro API key.
KEIRO_API_KEY = "your-keiro-key"


def robust_retrieve(query: str, max_retries: int = 3):
    """Query Keiro, retrying transient failures with exponential backoff.

    Rate limits (429), server errors (5xx) and network-level errors are
    retried after sleeping 1s, 2s, 4s, ... Any other non-200 status fails
    immediately.

    Args:
        query: Search query sent to the Keiro endpoint.
        max_retries: Total number of attempts to make.

    Returns:
        The parsed JSON response of the first attempt that returns 200.

    Raises:
        RuntimeError: On a non-retryable HTTP status, or when every attempt
            ended in a retryable HTTP status.
        requests.exceptions.RequestException: When the final attempt fails
            at the network level.
    """
    payload = {
        "query": query,
        "apiKey": KEIRO_API_KEY,
    }

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            response = requests.post(API_URL, json=payload, timeout=10)

            if response.status_code == 200:
                return response.json()

            if response.status_code != 429 and response.status_code < 500:
                # Client error (400, 401, etc.) - retrying cannot help.
                raise RuntimeError(f"Request failed: {response.text}")

            # Otherwise 429 or 5xx: fall through to the backoff below.

        except requests.exceptions.RequestException:
            # Network issues: retry unless this was the last attempt.
            if is_last_attempt:
                raise

        # Skip the backoff after the final attempt; nothing follows it, so
        # sleeping would only delay the error.
        if not is_last_attempt:
            time.sleep(2 ** attempt)

    raise RuntimeError("Max retries exceeded")

3. Batch Processing

For high-volume applications:

import requests
from concurrent.futures import ThreadPoolExecutor

# Keiro batch-search endpoint that fetch_query posts to.
API_URL = "https://kierolabs.space/api/batch-search"
# Placeholder - replace with your Keiro API key.
API_KEY = "your_api_key_here"


def fetch_query(query: str):
    """POST a single query to the Keiro endpoint and return the parsed JSON.

    The HTTP status is not inspected: whatever JSON body comes back is
    returned as-is.

    Args:
        query: Search query to send.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    payload = {
        "query": query,
        "apiKey": API_KEY,
    }

    # requests has no default timeout. In a thread pool, one stalled
    # connection would otherwise tie up a worker and block the whole batch.
    response = requests.post(API_URL, json=payload, timeout=10)
    return response.json()


def batch_search(queries):
    """Run ``fetch_query`` for every query on a pool of five worker threads.

    Returns:
        A list of responses in the same order as ``queries``.
    """
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = [pool.submit(fetch_query, query) for query in queries]
        # Collect in submission order so results line up with the inputs.
        return [task.result() for task in pending]

4. Context Window Management

def optimize_context(results, max_tokens: int = 4000):
    """Fit context within token limits.

    Results are appended in order until the next one would push the
    estimated token count past ``max_tokens``; that result and everything
    after it are dropped.

    Args:
        results: Iterable of results. Each is either a dict with "title" and
            "content" keys (the shape used by the other snippets) or an
            object with ``title`` and ``content`` attributes.
        max_tokens: Estimated token budget for the returned context.

    Returns:
        The concatenated "title / content / ---" sections that fit.
    """

    def _field(result, name):
        # The other snippets treat results as plain dicts, where attribute
        # access would raise AttributeError. Attribute access is kept for
        # callers that pass objects.
        if isinstance(result, dict):
            return result.get(name) or ""
        return getattr(result, name)

    sections = []
    token_count = 0

    for result in results:
        result_text = f"{_field(result, 'title')}\n{_field(result, 'content')}\n---\n"
        # Rough estimate: about 1.3 tokens per whitespace-separated word.
        result_tokens = len(result_text.split()) * 1.3

        if token_count + result_tokens > max_tokens:
            break

        sections.append(result_text)
        token_count += result_tokens

    # Join once instead of repeated string concatenation.
    return "".join(sections)

Complete Example: Q&A Chatbot

import requests
import openai

class RAGChatbot:
    """Multi-turn Q&A chatbot that grounds each reply in fresh Keiro results.

    Every call to ``chat`` runs a new search for the latest user message,
    puts the results in the system prompt, and sends the prior turns along.
    """

    def __init__(self, keiro_key: str, openai_key: str):
        """Store the Keiro key and set the OpenAI key on the openai module.

        Args:
            keiro_key: API key sent with each Keiro search request.
            openai_key: Assigned to ``openai.api_key`` (module-wide).
        """
        self.keiro_api_key = keiro_key
        openai.api_key = openai_key
        # Alternating user/assistant messages from earlier turns.
        self.conversation_history = []
        self.api_url = "https://kierolabs.space/api/search-pro"

    def chat(self, user_message: str) -> str:
        """Answer ``user_message`` and record the exchange in the history.

        Returns:
            The assistant's reply text.

        Raises:
            requests.exceptions.RequestException: If retrieval fails.
        """
        # Retrieve relevant context
        context = self._retrieve(user_message)

        # Build messages: system prompt with context, prior turns, new message.
        messages = [
            {
                "role": "system",
                "content": (
                    "You are a helpful assistant with access to real-time web information. "
                    "Use ONLY the provided context to answer accurately. "
                    "If the answer is not in the context, say you don't know.\n\n"
                    f"Context:\n{context}"
                ),
            }
        ] + self.conversation_history + [
            {"role": "user", "content": user_message}
        ]

        # Generate response
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            temperature=0.3,
        )

        assistant_message = response.choices[0].message.content

        # Update history only after a successful completion, so a failed
        # call leaves no dangling user turn.
        self.conversation_history.append({"role": "user", "content": user_message})
        self.conversation_history.append({"role": "assistant", "content": assistant_message})

        return assistant_message

    def _retrieve(self, query: str) -> str:
        """Search Keiro for ``query`` and return the formatted context.

        Raises:
            requests.exceptions.RequestException: On network failure,
                timeout, or a 4xx/5xx HTTP status.
        """
        payload = {
            "query": query,
            "apiKey": self.keiro_api_key,
        }

        response = requests.post(self.api_url, json=payload, timeout=10)
        # Fail loudly rather than formatting an error body as context.
        response.raise_for_status()
        return self._format_context(response.json())

    @staticmethod
    def _format_context(data) -> str:
        """Format up to five extracted results from a parsed Keiro response."""
        # Results are nested under data.extracted_content. Slicing the
        # "data" dict itself raises TypeError.
        results = (data.get("data") or {}).get("extracted_content") or []

        return "\n---\n".join(
            f"Source: {item.get('title')}\nContent: {(item.get('content') or '')[:500]}"
            for item in results[:5]
        )


# Usage: build the bot with placeholder keys, ask one question, print the reply.
bot = RAGChatbot(
    keiro_key="your-keiro-key",
    openai_key="your-openai-key",
)
response = bot.chat("What happened in tech news today?")
print(response)

Conclusion

Keiro makes building RAG pipelines dramatically simpler and cheaper:

  • No infrastructure needed - Skip vector databases and crawlers

  • Real-time data - Always fresh results

  • Cost-effective - 10x cheaper than alternatives

  • Production-ready - Built-in caching and batch processing

Get started with Keiro and build your first RAG pipeline in minutes.

✦ ✦ ✦

Filed under

Keiro API · RAG · Tutorial · LLM · AI Development
K
About the author
Written by Keiro Team

Building the most cost-effective AI search API. We're on a mission to make web scraping accessible and affordable for every developer.

Try Keiro free
Get started today

Ready to build something?

Join thousands of developers using Keiro to power their AI applications.

Start free trial · Read the docs