<a href="https://colab.research.google.com/github/AstraBert/qdurllm/blob/main/rageval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install --quiet datasets qdrant_client sentence_transformers langchain langchain_core langchain_community

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.1/254.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.6/983.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m366.3/366.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [2]:
from qdrant_client import models
from langchain_community.document_loaders.url import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter


def urlload(urls):
    links = urls.split(",")
    try:
        loader = UnstructuredURLLoader(
            urls=links, method="elements",
            strategy="fast"
        )
        docs = loader.load()
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        pages = text_splitter.split_documents(docs)
        contents = [{"text": pages[i].page_content, "url": pages[i].metadata["source"]} for i in range(len(pages))]
        return contents
    except Exception as e:
        return f"An error occurred while parsing the URLs: {e}"

class NeuralSearcher:
    def __init__(self, collection_name, client, model):
        self.collection_name = collection_name
        # Initialize encoder model
        self.model = model
        # initialize Qdrant client
        self.qdrant_client = client
    def search(self, text: str, num_return: int):
        # Convert text query into vector
        vector = self.model.encode(text).tolist()

        # Use `vector` for search for closest vectors in the collection
        search_result = self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=vector,
            query_filter=None,  # If you don't want any filters for now
            limit=num_return,  # 5 the most closest results is enough
        )
        # `search_result` contains found vector ids with similarity scores along with the stored payload
        # In this function you are interested in payload only
        payloads = [hit.payload for hit in search_result]
        return payloads



def upload_to_qdrant_collection(client, collection_name, encoder, documents):
    client.upload_points(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=idx, vector=encoder.encode(doc).tolist(), payload={"text": doc}
            )
            for idx, doc in enumerate(documents)
        ],
    )

def upload_to_qdrant_subcollection(client, collection_name, encoder, documents):
    client.delete_collection(collection_name=collection_name)
    client.create_collection(collection_name = collection_name,
        vectors_config=models.VectorParams(
            size = encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
            distance = models.Distance.COSINE,
        ),
    )
    client.upload_points(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=idx, vector=encoder.encode(doc["text"]).tolist(), payload=doc
            )
            for idx, doc in enumerate(documents)
        ],
    )


In [3]:
from huggingface_hub import login
from google.colab import userdata


token = userdata.get("HF_TOKEN")
login(token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from datasets import load_dataset

dataset = load_dataset("Lambent/synthetic-rag-hermes-simple-qa-1st-ic")

Downloading data:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/358 [00:00<?, ? examples/s]

In [5]:
questions = dataset["train"]["instruction"]
answers = dataset["train"]["response"]

In [6]:
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

qdrant_api_key = userdata.get("QDRANT_API_KEY")
qdrant_db = userdata.get("QDRANT_DB")

client = QdrantClient(
    url= qdrant_db,
    api_key= qdrant_api_key,
)

encoder = SentenceTransformer("all-MiniLM-L6-v2")
encoder1 = SentenceTransformer("sentence-t5-base")
coll_name = "Small_HTML_collection"
coll_name1 = "HTML_collection"
subcoll_name = "Subcollection"
small_subcoll_name = "Small_Subcollection"


client.recreate_collection(
    collection_name = coll_name,
    vectors_config=models.VectorParams(
        size = encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance = models.Distance.COSINE,
    ),
)

client.recreate_collection(
    collection_name = coll_name1,
    vectors_config=models.VectorParams(
        size = encoder1.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance = models.Distance.COSINE,
    ),
)


client.recreate_collection(
    collection_name = subcoll_name,
    vectors_config=models.VectorParams(
        size = encoder1.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance = models.Distance.COSINE,
    ),
)

client.recreate_collection(
    collection_name = small_subcoll_name,
    vectors_config=models.VectorParams(
        size = encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance = models.Distance.COSINE,
    ),
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

rust_model.ot:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

  client.recreate_collection(
  client.recreate_collection(
  client.recreate_collection(
  client.recreate_collection(


True

In [7]:
def call_upload2qdrant(documents, coll_name, encoder):
    global client
    if type(documents) == list:
        try:
            upload_to_qdrant_collection(client, coll_name, encoder, documents)
            return "Successfully uploaded URL content to Qdrant collection"
        except Exception as e:
            return f"An error occured: {e}"
    else:
        return documents

def reranked_rag(client, encoder0, encoder1, collection0, collection1, message):
    txt2txt0 = NeuralSearcher(collection0, client, encoder0)
    search_results0 = txt2txt0.search(message, 10)
    upload_to_qdrant_subcollection(client, collection1, encoder1, search_results0)
    txt2txt1 = NeuralSearcher(collection1, client, encoder1)
    search_results1 = txt2txt1.search(message, 1)
    return search_results1

def direct_search(input_text):
    global client, encoder, encoder1, coll_name, subcoll_name
    results = reranked_rag(client, encoder, encoder1, coll_name, subcoll_name, input_text)
    return results[0]["text"]

In [8]:
if __name__=="__main__":
    docs = answers

    print(call_upload2qdrant(docs,coll_name,encoder))

    print(call_upload2qdrant(docs,coll_name1,encoder1))

    from math import ceil
    import time

    newdocs = {}
    for i in range(len(docs)):
        newdocs.update({questions[i]: docs[i]})

    print("Successful mapping")
    txt2txt0 = NeuralSearcher(coll_name, client, encoder)
    txt2txt1 = NeuralSearcher(coll_name1, client, encoder1)

    times0 = []
    times1 = []
    times01 = []
    times10 = []
    points0 = 0
    points1 = 0
    points01 = 0
    points10 = 0

    from statistics import mean, stdev

    print("Started benchmark")
    for k in newdocs:
        start0 = time.time()
        results0 = txt2txt0.search(k,1)
        end0 = time.time()
        if results0[0]["text"] == newdocs[k]:
            points0+=1
        times0.append(end0-start0)
        start1 = time.time()
        results1 = txt2txt1.search(k,1)
        end1 = time.time()
        if results1[0]["text"] == newdocs[k]:
            points1+=1
        times1.append(end1-start1)
        start01 = time.time()
        results01 = reranked_rag(client, encoder, encoder1, coll_name, subcoll_name, k)
        end01 = time.time()
        if results01[0]["text"] == newdocs[k]:
            points01+=1
        times01.append(end01-start01)
        start10 = time.time()
        results10 = reranked_rag(client, encoder1, encoder, coll_name1, small_subcoll_name, k)
        end10 = time.time()
        if results10[0]["text"] == newdocs[k]:
            points10+=1
        times10.append(end10-start10)

    print(f"Avg time for All-MiniLM-L6-v2: {mean(times0)} +/- {stdev(times0)}")
    print(f"Avg time for sentence-t5-base: {mean(times1)} +/- {stdev(times1)}")
    print(f"Avg time for All-MiniLM-L6-v2 + sentence-t5-base: {mean(times01)} +/- {stdev(times01)}")
    print(f"Avg time for sentence-t5-base + All-MiniLM-L6-v2: {mean(times10)} +/- {stdev(times10)}")
    print(f"Correct/Total retrievals for All-MiniLM-L6-v2: {points0/len(newdocs)}")
    print(f"Correct/Total retrievals for sentence-t5-base: {points1/len(newdocs)}")
    print(f"Correct/Total retrievals for All-MiniLM-L6-v2 + sentence-t5-base: {points01/len(newdocs)}")
    print(f"Correct/Total retrievals for sentence-t5-base + All-MiniLM-L6-v2: {points10/len(newdocs)}")

Successfully uploaded URL content to Qdrant collection
Successfully uploaded URL content to Qdrant collection
Successful mapping
Started benchmark
Avg time for All-MiniLM-L6-v2: 0.13889479704117508 +/- 0.018010675079187972
Avg time for sentence-t5-base: 0.3546350625123871 +/- 0.1378480017367839
Avg time for All-MiniLM-L6-v2 + sentence-t5-base: 10.722357098306164 +/- 1.2552639024596886
Avg time for sentence-t5-base + All-MiniLM-L6-v2: 2.722710152020615 +/- 0.30703480688834284
Correct/Total retrievals for All-MiniLM-L6-v2: 0.41853932584269665
Correct/Total retrievals for sentence-t5-base: 0.5084269662921348
Correct/Total retrievals for All-MiniLM-L6-v2 + sentence-t5-base: 0.5028089887640449
Correct/Total retrievals for sentence-t5-base + All-MiniLM-L6-v2: 0.42134831460674155
