Add arxiv dataset to RelBench by PKUTHM · Pull Request #295 · snap-stanford/relbench · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content
8000

Add arxiv dataset to RelBench #295

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion relbench/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pooch

from relbench.base import Dataset
from relbench.datasets import amazon, avito, event, f1, hm, stack, trial
from relbench.datasets import amazon, avito, event, f1, hm, stack, trial, arxiv

dataset_registry = {}

Expand Down Expand Up @@ -98,3 +98,4 @@ def get_dataset(name: str, download=False) -> Dataset:
register_dataset("rel-hm", hm.HMDataset)
register_dataset("rel-stack", stack.StackDataset)
register_dataset("rel-trial", trial.TrialDataset)
register_dataset("rel-arxiv", arxiv.ArxivDataset)
78 changes: 78 additions & 0 deletions relbench/datasets/arxiv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import os
import pandas as pd
import pooch
from relbench.base import Database, Dataset, Table


class ArxivDataset(Dataset):
    r"""Relational arXiv dataset assembled from six CSV tables.

    The tables (papers, categories, citations, paper-category links,
    authors and paper-author links) ship together in one ``db.zip``
    archive that :meth:`make_db` downloads and unpacks.
    """

    # Cut-off timestamps read by the ``Dataset`` base class.
    # NOTE(review): presumably the validation/test split boundaries --
    # confirm against ``relbench.base.Dataset``.
    val_timestamp = pd.Timestamp("2021-01-01")
    test_timestamp = pd.Timestamp("2022-01-01")

    def make_db(self) -> Database:
        r"""Process the raw files into a database.

        Retrieves the archive through ``pooch`` (verified against a pinned
        SHA-256), loads the six CSV files, parses the ``Submission_Date``
        columns (``YYYYMMDD``) into timestamps and wraps every frame in a
        :class:`relbench.base.Table`.

        Returns:
            Database: the tables ``papers``, ``categories``, ``citations``,
            ``paperCategories``, ``authors`` and ``paperAuthors``.
        """
        url = (
            "https://www.dropbox.com/scl/fi/tjj6r1fqikt4j0rz4qomu/db.zip"
            "?rlkey=1ykfkp8pj3hu6n4utz8g9dkx2&st=azmm56dc&dl=1"
        )

        path = pooch.retrieve(
            url,
            known_hash="ff9e03e467e28df959d08c79c453db1f31b525f07ff3c0e0b5e571e732acc63f",
            progressbar=True,
            processor=pooch.Unzip(),
        )

        if isinstance(path, list):
            # The Unzip processor hands back the extracted file paths. Their
            # order is not something to rely on, so locate the data directory
            # through a known member instead of blindly trusting the first
            # entry; fall back to the first entry if that member is missing.
            anchors = [p for p in path if os.path.basename(p) == "1Paper.csv"]
            path = os.path.dirname(anchors[0] if anchors else path[0])

        print("Final dataset directory:", path)

        papers = pd.read_csv(os.path.join(path, "1Paper.csv"))
        categories = pd.read_csv(os.path.join(path, "2Category.csv"))
        citations = pd.read_csv(os.path.join(path, "3Citation.csv"))
        paper_categories = pd.read_csv(os.path.join(path, "4Paper_Category.csv"))
        authors = pd.read_csv(os.path.join(path, "5Author.csv"))
        paper_authors = pd.read_csv(os.path.join(path, "6Paper_Author.csv"))

        # Convert the date columns to pd.Timestamp.
        for frame in (papers, citations, paper_authors):
            frame["Submission_Date"] = pd.to_datetime(
                frame["Submission_Date"], format="%Y%m%d"
            )

        # The paper-category CSV is given its time column here: each link row
        # takes the submission date of its paper. With a left join, links
        # whose Paper_ID has no match in ``papers`` are kept with a missing
        # (NaT) date.
        paper_categories = paper_categories.merge(
            papers[["Paper_ID", "Submission_Date"]],
            on="Paper_ID",
            how="left",
        )

        # Collect all tables as relbench.base.Table objects. The dictionary
        # keys are the table names referenced by ``fkey_col_to_pkey_table``.
        tables = {
            "papers": Table(
                df=papers,
                fkey_col_to_pkey_table={},
                pkey_col="Paper_ID",
                time_col="Submission_Date",
            ),
            "categories": Table(
                df=categories,
                fkey_col_to_pkey_table={},
                pkey_col="Category_ID",
                time_col=None,
            ),
            "citations": Table(
                df=citations,
                fkey_col_to_pkey_table={
                    "Paper_ID": "papers",
                    "References_Paper_ID": "papers",
                },
                pkey_col=None,
                time_col="Submission_Date",
            ),
            "paperCategories": Table(
                df=paper_categories,
                fkey_col_to_pkey_table={
                    "Paper_ID": "papers",
                    "Category_ID": "categories",
                },
                pkey_col=None,
                time_col="Submission_Date",
            ),
            "authors": Table(
                df=authors,
                fkey_col_to_pkey_table={},
                pkey_col="Author_ID",
                time_col=None,
            ),
            "paperAuthors": Table(
                df=paper_authors,
                fkey_col_to_pkey_table={
                    "Paper_ID": "papers",
                    "Author_ID": "authors",
                },
                pkey_col=None,
                time_col="Submission_Date",
            ),
        }

        return Database(tables)
3 changes: 2 additions & 1 deletion relbench/datasets/hashes.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
"rel-f1/db.zip": "ec31a4e1bc2b2f9c36c05fcd3dfe2a40a506f335dc51ce79c3ec8bb40feb1482",
"rel-hm/db.zip": "3fb2aa3c978cc388aa81f43f9404fed329b502104ff58f3ee435959ab95e7029",
"rel-event/db.zip": "2593477065e1090af0258c07769ec22f40dad01f035613ccf35258b924d8b066",
"rel-amazon/db.zip": "db71c7701b892a4eb7481ff04d14d25465795501dba3a5931aabee9930805efe"
"rel-amazon/db.zip": "db71c7701b892a4eb7481ff04d14d25465795501dba3a5931aabee9930805efe",
"rel-arxiv/db.zip": "ff9e03e467e28df959d08c79c453db1f31b525f07ff3c0e0b5e571e732acc63f"
}
Loading
0