Feature: Github Retriever and PR Summary Chain by Arcadia822 · Pull Request #9 · codedog-ai/codedog · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Feature: Github Retriever and PR Summary Chain #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

8000
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ max-line-length = 120
exclude=
.venv
__pycache__
tmp/
2 changes: 2 additions & 0 deletions codedog/__init__,py → codedog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# flake8: noqa

from .version import VERSION

# Package-level verbosity flag, off by default.
# NOTE(review): the code that reads this flag is not visible here — confirm its consumers.
verbose: bool = False
3 changes: 3 additions & 0 deletions codedog/chains/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from codedog.chains.pr_summary.base import PRSummaryChain

# Public API of the ``codedog.chains`` package: only the PR summary chain is exported.
__all__ = ["PRSummaryChain"]
Empty file.
167 changes: 167 additions & 0 deletions codedog/chains/pr_summary/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
from __future__ import annotations

from typing import Any, Dict, List, Optional

from langchain import BasePromptTemplate, LLMChain
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.output_parsers import OutputFixingParser, PydanticOutputParser
from langchain.schema import BaseOutputParser
from pydantic import Extra, Field

from codedog.chains.pr_summary.processor import (
SUFFIX_LANGUAGE_MAPPING,
PRSummaryProcessor,
)
from codedog.chains.pr_summary.prompts import CODE_SUMMARY_PROMPT, PR_SUMMARY_PROMPT
from codedog.models import ChangeSummary, PRSummary, PullRequest


class PRSummaryChain(Chain):
    """Summarize a pull request.

    The chain first summarizes every changed code file with ``code_summary_chain``,
    then feeds those summaries, the changed-file listing and the PR metadata to
    ``pr_summary_chain`` to produce the overall summary.

    Inputs are:
    - pull_request(PullRequest): a pull request object

    Outputs are:
    - pr_summary(PRSummary): summary of pull request.
    - code_summaries(List[ChangeSummary]): summaries of the changed code files, one per file.
    """

    # TODO: localization
    # TODO: prompt input keys validation

    code_summary_chain: LLMChain = Field(exclude=True)
    """Chain to use to summarize code change."""
    pr_summary_chain: LLMChain = Field(exclude=True)
    """Chain to use to summarize PR."""
    parser: BaseOutputParser = Field(exclude=True)
    """Parse pr summarized result to PRSummary object."""
    processor: PRSummaryProcessor = Field(exclude=True, default_factory=PRSummaryProcessor)
    """PR data process."""
    _input_keys: List[str] = ["pull_request"]
    _output_keys: List[str] = ["pr_summary", "code_summaries"]

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def _chain_type(self) -> str:
        return "pull_request_summary_chain"

    @property
    def input_keys(self) -> List[str]:
        """Input keys of the chain: always ``["pull_request"]``.

        :meta private:
        """
        return self._input_keys

    @property
    def output_keys(self) -> List[str]:
        """Output keys of the chain: ``["pr_summary", "code_summaries"]``.

        :meta private:
        """
        return self._output_keys

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Run the summary pipeline synchronously.

        Args:
            inputs: chain inputs; must contain the ``pull_request`` key.
            run_manager: callback manager of the current run, or None.

        Returns:
            Dict with the ``pr_summary`` and ``code_summaries`` keys.
        """
        # TODO: handle callbacks

        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        pr: PullRequest = inputs["pull_request"]
        _run_manager.on_text(pr.json() + "\n")

        # Pass the resolved manager (never None) so the helper can always report progress.
        code_summary_inputs = self._process_code_summary_inputs(pr, _run_manager)
        code_summary_outputs = (
            self.code_summary_chain.apply(code_summary_inputs, callbacks=_run_manager.get_child(tag="CodeSummary"))
            if code_summary_inputs
            else []
        )

        code_summaries = self.processor.build_change_summaries(code_summary_inputs, code_summary_outputs)

        pr_summary_input = self._process_pr_summary_input(pr, code_summaries)
        pr_summary_output = self.pr_summary_chain(pr_summary_input, callbacks=_run_manager.get_child())

        return self._process_result(self._extract_pr_summary(pr_summary_output), code_summaries)

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Run the summary pipeline asynchronously; mirrors ``_call`` step by step.

        Args:
            inputs: chain inputs; must contain the ``pull_request`` key.
            run_manager: async callback manager of the current run, or None.

        Returns:
            Dict with the ``pr_summary`` and ``code_summaries`` keys.
        """
        # The async manager's callbacks are coroutines, so the noop fallback must be async too.
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        pr: PullRequest = inputs["pull_request"]
        await _run_manager.on_text(pr.json() + "\n")

        # The helper is synchronous: build the inputs without a manager and report progress here.
        code_summary_inputs = self._process_code_summary_inputs(pr)
        await _run_manager.on_text(f"Prepare code diff content for {len(code_summary_inputs)} files.\n")
        code_summary_outputs = (
            await self.code_summary_chain.aapply(
                code_summary_inputs, callbacks=_run_manager.get_child(tag="CodeSummary")
            )
            if code_summary_inputs
            else []
        )

        code_summaries = self.processor.build_change_summaries(code_summary_inputs, code_summary_outputs)

        pr_summary_input = self._process_pr_summary_input(pr, code_summaries)
        pr_summary_output = await self.pr_summary_chain.acall(pr_summary_input, callbacks=_run_manager.get_child())

        return self._process_result(self._extract_pr_summary(pr_summary_output), code_summaries)

    def _process_code_summary_inputs(
        self,
        pr: PullRequest,
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> List[Dict[str, str]]:
        """Build one code-summary prompt input per changed code file of the pull request.

        Args:
            pr: pull request whose changed code files are summarized.
            run_manager: optional sync callback manager used to report progress.

        Returns:
            List of dicts with the ``content``, ``name`` and ``language`` keys.
        """
        input_data = [
            {
                "content": code_file.diff_content,
                "name": code_file.full_name,
                "language": SUFFIX_LANGUAGE_MAPPING.get(code_file.suffix, ""),
            }
            for code_file in self.processor.get_diff_code_files(pr)
        ]

        # run_manager is optional; callers without one (e.g. the async path) report progress themselves.
        if run_manager is not None:
            run_manager.on_text(f"Prepare code diff content for {len(input_data)} files.\n")
        return input_data

    def _process_pr_summary_input(self, pr: PullRequest, code_summaries: List[ChangeSummary]) -> Dict[str, str]:
        """Render the three text materials consumed by the PR summary prompt."""
        change_files_material: str = self.processor.gen_material_change_files(pr.change_files)
        code_summaries_material = self.processor.gen_material_code_summaries(code_summaries)
        pr_metadata_material = self.processor.gen_material_pr_metadata(pr)
        return {
            "change_files": change_files_material,
            "code_summaries": code_summaries_material,
            "metadata": pr_metadata_material,
        }

    def _extract_pr_summary(self, chain_output: Dict[str, Any]) -> PRSummary:
        """Pull the summary out of the dict returned by ``pr_summary_chain``.

        Calling a chain returns its inputs plus its output key, not the bare result.
        If the value is still raw text (the LLM chain applied no output parser),
        it is parsed with ``self.parser``.
        """
        summary = chain_output[self.pr_summary_chain.output_key]
        if isinstance(summary, str):
            summary = self.parser.parse(summary)
        return summary

    def _process_result(self, pr_summary: PRSummary, code_summaries: List[ChangeSummary]) -> Dict[str, Any]:
        """Assemble the chain output dict matching ``output_keys``."""
        return {
            "pr_summary": pr_summary,
            "code_summaries": code_summaries,
        }

    @classmethod
    def from_llm(
        cls,
        code_summary_llm: BaseLanguageModel,
        pr_summary_llm: BaseLanguageModel,
        code_summary_prompt: BasePromptTemplate = CODE_SUMMARY_PROMPT,
        pr_summary_prompt: BasePromptTemplate = PR_SUMMARY_PROMPT,
        **kwargs: Any,
    ) -> PRSummaryChain:
        """Build the chain from two language models and (optionally) custom prompts.

        Args:
            code_summary_llm: model used to summarize each changed code file.
            pr_summary_llm: model used to summarize the PR and to fix malformed summary output.
            code_summary_prompt: prompt for the per-file summary chain.
            pr_summary_prompt: prompt for the PR summary chain.
            **kwargs: forwarded to the chain constructor.
        """
        parser = OutputFixingParser.from_llm(
            llm=pr_summary_llm,
            parser=PydanticOutputParser(pydantic_object=PRSummary),
        )
        code_summary_chain = LLMChain(llm=code_summary_llm, prompt=code_summary_prompt)
        pr_summary_chain = LLMChain(llm=pr_summary_llm, prompt=pr_summary_prompt, output_parser=parser)
        return cls(code_summary_chain=code_summary_chain, pr_summary_chain=pr_summary_chain, parser=parser, **kwargs)
125 changes: 125 additions & 0 deletions codedog/chains/pr_summary/processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import itertools
from typing import Dict, List

from codedog.models import ChangeFile, ChangeStatus, ChangeSummary, PullRequest
from codedog.templates import template_cn, template_en

# Change statuses accepted by PRSummaryProcessor.get_diff_code_files when selecting
# the files whose diffs get summarized: added and modified files only.
CONTENT_CHANGE_STATUS = [ChangeStatus.addition, ChangeStatus.modified]

# File suffix (without the dot) -> language name passed to the code summary prompt.
SUFFIX_LANGUAGE_MAPPING = dict(
    py="python",
    java="java",
    go="go",
    js="javascript",
    ts="typescript",
    php="php",
    c="c",
    cpp="cpp",
    h="c",
    cs="csharp",
    rs="rust",
)

# Suffixes recognized as code files: exactly the suffixes that have a language mapping.
SUPPORT_CODE_FILE_SUFFIX = set(SUFFIX_LANGUAGE_MAPPING)

# Section header written above each group of changed files in the change-files
# material, keyed by the files' change status.
STATUS_HEADER_MAPPING = {
    ChangeStatus.addition: "Added files:",
    ChangeStatus.copy: "Copied files:",
    ChangeStatus.deletion: "Deleted files:",
    ChangeStatus.modified: "Modified files:",
    ChangeStatus.renaming: "Renamed files:",
    ChangeStatus.type_change: "Type changed files:",
    ChangeStatus.unknown: "Other files:",
}


class PRSummaryProcessor:
    """Prepare pull request data for the summary prompts and collect the results.

    It selects the changed code files to summarize, renders the text materials
    (changed-file listing, code summaries, PR metadata) and pairs the code summary
    outputs with their inputs.
    """

    # TODO: localization
    def __init__(self, language: str = "en"):
        """Create a processor.

        Args:
            language: "en" selects the English template; any other value selects the Chinese one.
        """
        # Statuses whose listing line also names the source file; others use the default line.
        self._status_template_functions = {
            ChangeStatus.copy: self._build_status_template_copy,
            ChangeStatus.renaming: self._build_status_template_rename,
        }

        self.language = language
        self.template = template_en if language == "en" else template_cn

    def is_code_file(self, change_file: ChangeFile) -> bool:
        """Return True when the file suffix is one of the supported code suffixes."""
        return change_file.suffix in SUPPORT_CODE_FILE_SUFFIX

    def get_diff_code_files(self, pr: PullRequest) -> list[ChangeFile]:
        """Return the added or modified code files of the pull request, in their original order."""
        return [
            change_file
            for change_file in pr.change_files
            if change_file.status in CONTENT_CHANGE_STATUS and self.is_code_file(change_file)
        ]

    def gen_material_change_files(self, change_files: list[ChangeFile]) -> str:
        """Render the changed files as one section per status: a header line, then one line per file.

        Sections appear in order of each status's first occurrence. Files are collected
        per status across the whole list, so a status yields exactly one section even
        when its files are not adjacent in ``change_files``.
        """
        files_by_status: Dict[ChangeStatus, List[ChangeFile]] = {}
        for change_file in change_files:
            files_by_status.setdefault(change_file.status, []).append(change_file)

        # An unmapped status falls back to the header text of the "unknown" status.
        fallback_header = STATUS_HEADER_MAPPING[ChangeStatus.unknown]
        sections = []
        for status, files in files_by_status.items():
            header = STATUS_HEADER_MAPPING.get(status, fallback_header)
            build_line = self._status_template_functions.get(status, self._build_status_template_default)
            sections.append(f"{header}\n" + "\n".join(build_line(file) for file in files) + "\n")

        return "\n".join(sections)

    def gen_material_code_summaries(self, code_summaries: list[ChangeSummary]) -> str:
        """Render every code summary with the template and join them with blank lines."""
        return (
            "\n\n".join(
                self.template.MATERIAL_CODE_SUMMARY.format(summary=code_summary.summary, name=code_summary.full_name)
                for code_summary in code_summaries
            )
            + "\n"
        )

    def gen_material_pr_metadata(self, pr: PullRequest) -> str:
        """Render the PR title, body and the titles of its related issues with the template."""
        return self.template.MATERIAL_PR_METADATA.format(
            pr_title=pr.title,
            pr_body=pr.body,
            issues="\n".join(f"- {issue.title}" for issue in pr.related_issues),
        )

    def build_change_summaries(
        self, summaries_input: List[Dict[str, str]], summaries_output: List[Dict[str, str]]
    ) -> List[ChangeSummary]:
        """Pair each code summary input with its output, position by position.

        Args:
            summaries_input: prompt inputs; each item's ``name`` is the file's full name.
            summaries_output: chain outputs; each item's ``text`` is the summary.

        Raises:
            ValueError: when the two lists differ in length, since they cannot be paired.
        """
        if len(summaries_input) != len(summaries_output):
            raise ValueError(
                f"Code summary inputs ({len(summaries_input)}) and outputs ({len(summaries_output)}) "
                "must have the same length."
            )

        return [
            ChangeSummary(full_name=summary_input["name"], summary=summary_output["text"])
            for summary_input, summary_output in zip(summaries_input, summaries_output)
        ]

    def _build_status_template_default(self, change_file: ChangeFile) -> str:
        """Listing line carrying only the file name."""
        return f"- {change_file.full_name}"

    def _build_status_template_copy(self, change_file: ChangeFile) -> str:
        """Listing line of a copied file, naming the file it was copied from."""
        return f"- {change_file.full_name} (copied from {change_file.source_full_name})"

    def _build_status_template_rename(self, change_file: ChangeFile) -> str:
        """Listing line of a renamed file, naming the file it was renamed from."""
        return f"- {change_file.full_name} (renamed from {change_file.source_full_name})"


if __name__ == "__main__":
    # Manual smoke test: fetch a pull request from GitHub and print the processor output.
    # The GitHub token is read from the GITHUB_TOKEN environment variable.
    import os

    from github import Github

    from codedog.retrievers import GithubRetriever

    client = Github(os.environ.get("GITHUB_TOKEN"))
    retriever = GithubRetriever(client, "codedog-ai/codedog", 2)
    pull_request = retriever.pull_request

    pr_preprocess = PRSummaryProcessor()
    print(pr_preprocess.gen_material_change_files(pull_request.change_files))

    # get_diff_code_files reads `.change_files` from its argument, so it needs the
    # pull request itself rather than the list of changed files.
    code_files = pr_preprocess.get_diff_code_files(pull_request)
    for code_file in code_files:
        print(code_file.full_name)
14 changes: 14 additions & 0 deletions codedog/chains/pr_summary/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from langchain import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

from codedog.models import PRSummary
from codedog.templates import grimoire_en

# Parser for PRSummary objects; used here to produce the format instructions
# embedded in the PR summary prompt.
parser = PydanticOutputParser(pydantic_object=PRSummary)

# Prompt for the overall PR summary. The caller fills in metadata, change_files and
# code_summaries; format_instructions is pre-filled from the parser above.
PR_SUMMARY_PROMPT = PromptTemplate(
    template=grimoire_en.PR_SUMMARY,
    input_variables=["metadata", "change_files", "code_summaries"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
# Prompt for summarizing a single changed code file, given its name, language and content.
CODE_SUMMARY_PROMPT = PromptTemplate(template=grimoire_en.CODE_SUMMARY, input_variables=["name", "language", "content"])
File renamed without changes.
24 changes: 24 additions & 0 deletions codedog/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from codedog.models.blob import Blob
from codedog.models.change_file import ChangeFile, ChangeStatus
from codedog.models.change_summary import ChangeSummary
from codedog.models.commit import Commit
from codedog.models.diff import DiffContent, DiffSegment
from codedog.models.issue import Issue
from codedog.models.pr_summary import PRSummary, PRType
from codedog.models.pull_request import PullRequest
from codedog.models.repository import Repository

# Public API of the ``codedog.models`` package: every model imported above is re-exported.
__all__ = [
    "Blob",
    "ChangeFile",
    "ChangeStatus",
    "ChangeSummary",
    "Commit",
    "DiffContent",
    "DiffSegment",
    "Issue",
    "PRSummary",
    "PRType",
    "PullRequest",
    "Repository",
]
18 changes: 18 additions & 0 deletions codedog/models/blob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from pydantic import BaseModel, Field


class Blob(BaseModel):
    """Git blob object.

    Pydantic model holding a blob's ids, content, content encoding, size and url.
    Every field is declared with a bare ``Field()``, i.e. without a default value.
    """

    blob_id: int = Field()
    """Blob id. Converted from sha."""
    sha: str = Field()
    """Blob sha."""
    content: str = Field()
    """Blob content."""
    encoding: str = Field()
    """Blob content encoding."""
    size: int = Field()
    """Blob content size."""
    url: str = Field()
    """Blob url."""
Loading
0