Spaces:

shimizukawa
/

python-no-senpai

Sleeping

App Files Files Community

terapyon committed on Aug 27, 2023

Commit

cd3709a

1 Parent(s): f913bc4

make to store functions

Browse files

Files changed (6) hide show

.gitignore +5 -0
README.md +3 -3
config.py +21 -0
gh_issue_loader.py +67 -0
requirments.txt +10 -0
store.py +59 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.vscode/
+__pycache__/
+venv/
+qdrant_storage/
+data/

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
-title: Gh Issue Search
 emoji: 🐠
 colorFrom: green
 colorTo: purple
-sdk: gradio
-sdk_version: 3.40.0
 app_file: app.py
 pinned: false
 license: mit

 ---
+title: Github Issue Search
 emoji: 🐠
 colorFrom: green
 colorTo: purple
+sdk: streamlit
+sdk_version: 1.25.0
 app_file: app.py
 pinned: false
 license: mit

config.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import os
+SAAS = False  # True: hosted Qdrant configured from env vars; False: Qdrant on localhost
+def get_db_config():
+    url = os.environ["QDRANT_URL"]
+    api_key = os.environ["QDRANT_API_KEY"]
+    collection_name = "gh-issues"
+    return url, api_key, collection_name
+def get_local_db_congin():
+    url = "localhost"
+    # api_key = os.environ["QDRANT_API_KEY"]
+    collection_name = "gh-issues"
+    return url, None, collection_name
+DB_CONFIG = get_db_config() if SAAS else get_local_db_congin()

gh_issue_loader.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from dataclasses import dataclass, asdict
+import json
+from typing import Iterator
+from dateutil.parser import parse
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+@dataclass
+class Issue:  # Metadata for one GitHub issue, or for one comment on an issue.
+    repo_name: str  # repository name supplied by the caller of get_contents
+    id: int  # the issue's "number" for issues; the comment's "id" for comments
+    title: str  # issue title; comments carry the title of their parent issue
+    created_at: int  # creation time as an integer timestamp (see date_to_int)
+    user: str  # value of the record's "user.login" field
+    url: str  # value of the record's "html_url" field
+    labels: list[str]  # the issue's "labels_" value; comments reuse the parent's
+    type_: str  # "issue" or "comment"
+def date_to_int(dt_str: str) -> int:
+    dt = parse(dt_str)
+    return int(dt.timestamp())
+def get_contents(repo_name: str, filename: str) -> Iterator[tuple[Issue, str]]:
+    with open(filename, "r") as f:
+        obj = [json.loads(line) for line in f]
+    for data in obj:
+        issue = Issue(
+            repo_name=repo_name,
+            id=data["number"],
+            title=data["title"],
+            created_at=date_to_int(data["created_at"]),
+            user=data["user.login"],
+            url=data["html_url"],
+            labels=data["labels_"],
+            type_="issue",
+        )
+        yield issue, data["body"]
+        comments = data["comments_"]
+        for comment in comments:
+            issue = Issue(
+                repo_name=repo_name,
+                id=comment["id"],
+                title=data["title"],
+                created_at=date_to_int(comment["created_at"]),
+                user=comment["user.login"],
+                url=comment["html_url"],
+                labels=data["labels_"],
+                type_="comment",
+            )
+            yield issue, comment["body"]
+class GHLoader(BaseLoader):
+    def __init__(self, repo_name: str, filename: str):
+        self.repo_name = repo_name
+        self.filename = filename
+    def lazy_load(self) -> Iterator[Document]:
+        for issue, text in get_contents(self.repo_name, self.filename):
+            metadata = asdict(issue)
+            yield Document(page_content=text, metadata=metadata)
+    def load(self) -> list[Document]:
+        return list(self.lazy_load())

requirments.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+langchain
+tiktoken
+qdrant-client
+torch
+transformers
+accelerate
+bitsandbytes
+sentence_transformers
+streamlit
+python-dateutil

store.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Qdrant
+from gh_issue_loader import GHLoader
+from config import DB_CONFIG
+CHUNK_SIZE = 500
+def get_text_chunk(docs):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=CHUNK_SIZE, chunk_overlap=0
+    )
+    texts = text_splitter.split_documents(docs)
+    return texts
+def store(texts):
+    model_name = "intfloat/multilingual-e5-large"
+    model_kwargs = {"device": "cuda"}
+    encode_kwargs = {"normalize_embeddings": False}
+    embeddings = HuggingFaceEmbeddings(
+        model_name=model_name,
+        model_kwargs=model_kwargs,
+        encode_kwargs=encode_kwargs,
+    )
+    db_url, db_api_key, db_collection_name = DB_CONFIG
+    _ = Qdrant.from_documents(
+        texts,
+        embeddings,
+        url=db_url,
+        api_key=db_api_key,
+        collection_name=db_collection_name,
+    )
+def main(repo_name: str, path: str) -> None:
+    loader = GHLoader(repo_name, path)
+    docs = loader.load()
+    texts = get_text_chunk(docs)
+    store(texts)
+if __name__ == "__main__":
+    """
+    $ python store.py "REPO_NAME" "FILE_PATH"
+    $ python store.py cocoa data/cocoa-issues.json
+    """
+    import sys
+    args = sys.argv
+    if len(args) != 3:
+        print("No args, you need two args for repo_name, json_file_path")
+    else:
+        repo_name = args[1]
+        path = args[2]
+        main(repo_name, path)