Cp calc speedup by lsabor · Pull Request #272 · Metaculus/metaculus

Merged · 2 commits · Aug 28, 2024

12 changes: 7 additions & 5 deletions questions/management/commands/build_forecasts.py
@@ -13,7 +13,7 @@ class Command(BaseCommand):
     help = "Builds forecasts for all questions"
 
     def handle(self, *args, **options):
-        qs = Question.objects.all().prefetch_related("user_forecasts")
+        qs = Question.objects.all().order_by("id").prefetch_related("user_forecasts")
         total = qs.count()
         processed = 0
         tm = time.time()
@@ -31,11 +31,13 @@ def handle(self, *args, **options):
 
             processed += 1
             print(
-                f"Processed {int(processed / total * 100)}% ({processed}/{total})"
-                f" questions. Duration: {round(time.time() - tm)}s",
+                f"Processed {int(processed / total * 100)}% ({processed}/{total}) "
+                f"Dur:{round(time.time() - tm)}s "
+                f"Est:{round((time.time() - tm) / processed * (total - processed))}s",
                 end="\r",
            )
        print(
-            f"Processed {int(processed / total * 100)}% ({processed}/{total})"
-            f" questions. Duration: {round(time.time() - tm)}s",
+            f"Processed {int(processed / total * 100)}% ({processed}/{total}) "
+            f"Dur:{round(time.time() - tm)}s "
+            f"Est:{round((time.time() - tm) / processed * (total - processed))}s"
        )
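
The reworked progress line adds a simple linear remaining-time estimate next to the elapsed time: average seconds per question so far, scaled by the questions left. A minimal sketch of the arithmetic, using the same names as the command above:

    # e.g. 200 of 1000 questions done after 50s
    elapsed = time.time() - tm
    # linear extrapolation: 50 / 200 * (1000 - 200) = 200s remaining
    est_remaining = elapsed / processed * (total - processed)
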
90 changes: 46 additions & 44 deletions utils/the_math/community_prediction.py
@@ -154,55 +154,32 @@ def filter_between_dates(timestamps, start_time, end_time=None):
     return timestamps[start_index:end_index]
 
 
-def get_user_forecast_history(question: Question) -> list[ForecastSet]:
-    forecasts = question.user_forecasts.order_by("start_time").all()
-    timestamps = set()
-    for forecast in forecasts:
-        timestamps.add(forecast.start_time)
-        if forecast.end_time:
-            timestamps.add(forecast.end_time)
-
-    timestamps = sorted(timestamps)
-    output = defaultdict(list)
-
-    for forecast in forecasts:
-        # Find active timestamps
-        forecast_timestamps = filter_between_dates(
-            timestamps, forecast.start_time, forecast.end_time
-        )
-
-        for timestamp in forecast_timestamps:
-            output[timestamp].append(forecast.get_prediction_values())
-
-    return [ForecastSet(output[key], key) for key in sorted(output.keys())]
-
-
-def minimize_forecast_history(
-    forecast_history: list[AggregateForecast],
+def minimize_history(
+    history: list[datetime],
     max_size: int = 128,
-) -> list[AggregateForecast]:
-    if len(forecast_history) <= max_size:
-        return forecast_history
+) -> list[datetime]:
+    if len(history) <= max_size:
+        return history
 
-    # this is a pretty cheap algorithm that generates a minimized forecast history
-    # by taking the middle (wrt start_time) forecast of the list, then the middle
+    # this is a pretty cheap algorithm that generates a minimized history
+    # by taking the middle time of the list, then the middle
     # of the two halves, then the middle of the four quarters, etc. 7 times,
-    # generating a maximum list of 128 forecasts close evenly spaced.
+    # generating a maximum list of 128 datetimes close evenly spaced.
 
     def find_index_of_middle(forecasts: list[AggregateForecast]) -> int:
         if len(forecasts) < 3:
             return 0
-        t0 = forecasts[0].start_time
-        t2 = forecasts[-1].start_time
+        t0 = forecasts[0]
+        t2 = forecasts[-1]
         t1 = t0 + (t2 - t0) / 2
         for i, forecast in enumerate(forecasts):
-            if forecast.start_time > t1:
-                if forecast.start_time - t1 < t1 - forecasts[i - 1].start_time:
+            if forecast > t1:
+                if forecast - t1 < t1 - forecasts[i - 1]:
                     return i
                 return i - 1
 
     minimized = []
-    working_lists = [forecast_history]
+    working_lists = [history]
     for _ in range(int(np.ceil(np.log2(max_size)))):
         new_working_lists = []
         for working_list in working_lists:
@@ -215,16 +192,43 @@ def find_index_of_middle(forecasts: list[AggregateForecast]) -> int:
             new_working_lists.append(working_list[middle_index + 1 :])
         working_lists = new_working_lists
 
-    minimized: list[AggregateForecast] = sorted(minimized, key=lambda x: x.start_time)
+    minimized: list[AggregateForecast] = sorted(minimized)
     # make sure to always have the first and last forecast are the first
     # and last of the original list
-    if minimized[0].start_time != forecast_history[0].start_time:
-        minimized.insert(0, forecast_history[0])
-    if minimized[-1].start_time != forecast_history[-1].start_time:
-        minimized.append(forecast_history[-1])
+    if minimized[0] != history[0]:
+        minimized.insert(0, history[0])
+    if minimized[-1] != history[-1]:
+        minimized.append(history[-1])
     return minimized
 
 
+def get_user_forecast_history(
+    question: Question, minimize: bool = False
+) -> list[ForecastSet]:
+    forecasts = question.user_forecasts.order_by("start_time").all()
+    timestamps = set()
+    for forecast in forecasts:
+        timestamps.add(forecast.start_time)
+        if forecast.end_time:
+            timestamps.add(forecast.end_time)
+
+    timestamps = sorted(timestamps)
+    if minimize:
+        timestamps = minimize_history(timestamps)
+    output = defaultdict(list)
+
+    for forecast in forecasts:
+        # Find active timestamps
+        forecast_timestamps = filter_between_dates(
+            timestamps, forecast.start_time, forecast.end_time
+        )
+
+        for timestamp in forecast_timestamps:
+            output[timestamp].append(forecast.get_prediction_values())
+
+    return [ForecastSet(output[key], key) for key in sorted(output.keys())]
+
+
 def generate_recency_weights(number_of_forecasts: int) -> np.ndarray:
     if number_of_forecasts <= 2:
         return None
@@ -241,7 +245,7 @@ def get_cp_history(
 ) -> list[AggregateForecast]:
     full_summary: list[AggregateForecast] = []
 
-    forecast_history = get_user_forecast_history(question)
+    forecast_history = get_user_forecast_history(question, minimize=minimize)
     for i, forecast_set in enumerate(forecast_history):
         if aggregation_method == AggregationMethod.RECENCY_WEIGHTED:
             weights = generate_recency_weights(len(forecast_set.forecasts_values))
@@ -262,6 +266,4 @@ def get_cp_history(
         full_summary[-1].end_time = new_entry.start_time
         full_summary.append(new_entry)
 
-    if minimize:
-        return minimize_forecast_history(full_summary)
     return full_summary
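
This is the heart of the speedup: minimize_history now thins the timestamp grid before any aggregation happens, so get_cp_history builds and aggregates at most ~128 ForecastSets instead of one per distinct forecast boundary; the old code aggregated everything and only then trimmed the result with minimize_forecast_history. A self-contained sketch of the downsampler follows. The loop body that the diff collapses between the two hunks above is reconstructed here from the visible lines and comments, so treat it as illustrative rather than verbatim:

    from datetime import datetime, timedelta

    import numpy as np


    def minimize_history(history: list[datetime], max_size: int = 128) -> list[datetime]:
        if len(history) <= max_size:
            return history

        def find_index_of_middle(times: list[datetime]) -> int:
            # index of the entry closest to the temporal midpoint
            if len(times) < 3:
                return 0
            t1 = times[0] + (times[-1] - times[0]) / 2
            for i, t in enumerate(times):
                if t > t1:
                    return i if t - t1 < t1 - times[i - 1] else i - 1

        minimized: list[datetime] = []
        working_lists = [history]
        # ceil(log2(128)) = 7 rounds: pick each list's temporal midpoint,
        # then recurse into the two halves on either side of it
        for _ in range(int(np.ceil(np.log2(max_size)))):
            new_working_lists = []
            for working_list in working_lists:
                if not working_list:
                    continue
                middle_index = find_index_of_middle(working_list)
                minimized.append(working_list[middle_index])
                new_working_lists.append(working_list[:middle_index])
                new_working_lists.append(working_list[middle_index + 1 :])
            working_lists = new_working_lists

        minimized = sorted(minimized)
        # always keep the endpoints of the original history
        if minimized[0] != history[0]:
            minimized.insert(0, history[0])
        if minimized[-1] != history[-1]:
            minimized.append(history[-1])
        return minimized


    # 1000 hourly timestamps collapse to 129 roughly evenly spaced ones
    # (127 midpoints plus the two original endpoints)
    ts = [datetime(2024, 1, 1) + timedelta(hours=i) for i in range(1000)]
    print(len(minimize_history(ts)))
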
100 changes: 47 additions & 53 deletions utils/the_math/single_aggregation.py
@@ -206,63 +206,32 @@ def filter_between_dates(timestamps, start_time, end_time=None):
     return timestamps[start_index:end_index]
 
 
-def get_user_forecast_history(
-    forecasts: QuerySet[Forecast],
-) -> list[ForecastSet]:
-    timestamps = set()
-    for forecast in forecasts:
-        timestamps.add(forecast.start_time)
-        if forecast.end_time:
-            timestamps.add(forecast.end_time)
-
-    timestamps = sorted(timestamps)
-    prediction_values = defaultdict(list)
-    users = defaultdict(list)
-    timesteps = defaultdict(list)
-
-    for forecast in forecasts:
-        # Find active timestamps
-        forecast_timestamps = filter_between_dates(
-            timestamps, forecast.start_time, forecast.end_time
-        )
-
-        for timestamp in forecast_timestamps:
-            prediction_values[timestamp].append((forecast.get_prediction_values()))
-            users[timestamp].append(forecast.author)
-            timesteps[timestamp].append(forecast.start_time)
-
-    return [
-        ForecastSet(prediction_values[key], key, users[key], timesteps[key])
-        for key in sorted(prediction_values.keys())
-    ]
-
-
-def minimize_forecast_history(
-    forecast_history: list[AggregateForecast],
+def minimize_history(
+    history: list[datetime],
     max_size: int = 128,
-) -> list[AggregateForecast]:
-    if len(forecast_history) <= max_size:
-        return forecast_history
+) -> list[datetime]:
+    if len(history) <= max_size:
+        return history
 
-    # this is a pretty cheap algorithm that generates a minimized forecast history
-    # by taking the middle (wrt start_time) forecast of the list, then the middle
+    # this is a pretty cheap algorithm that generates a minimized history
+    # by taking the middle time of the list, then the middle
     # of the two halves, then the middle of the four quarters, etc. 7 times,
-    # generating a maximum list of 128 forecasts close evenly spaced.
+    # generating a maximum list of 128 datetimes close evenly spaced.
 
     def find_index_of_middle(forecasts: list[AggregateForecast]) -> int:
         if len(forecasts) < 3:
             return 0
-        t0 = forecasts[0].start_time
-        t2 = forecasts[-1].start_time
+        t0 = forecasts[0]
+        t2 = forecasts[-1]
         t1 = t0 + (t2 - t0) / 2
         for i, forecast in enumerate(forecasts):
-            if forecast.start_time > t1:
-                if forecast.start_time - t1 < t1 - forecasts[i - 1].start_time:
+            if forecast > t1:
+                if forecast - t1 < t1 - forecasts[i - 1]:
                     return i
                 return i - 1
 
-    minimized: list[AggregateForecast] = []
-    working_lists = [forecast_history]
+    minimized = []
+    working_lists = [history]
     for _ in range(int(np.ceil(np.log2(max_size)))):
         new_working_lists = []
         for working_list in working_lists:
@@ -275,16 +244,43 @@ def find_index_of_middle(forecasts: list[AggregateForecast]) -> int:
             new_working_lists.append(working_list[middle_index + 1 :])
         working_lists = new_working_lists
 
-    minimized = sorted(minimized, key=lambda x: x.start_time)
+    minimized: list[AggregateForecast] = sorted(minimized)
     # make sure to always have the first and last forecast are the first
     # and last of the original list
-    if minimized[0].start_time != forecast_history[0].start_time:
-        minimized.insert(0, forecast_history[0])
-    if minimized[-1].start_time != forecast_history[-1].start_time:
-        minimized.append(forecast_history[-1])
+    if minimized[0] != history[0]:
+        minimized.insert(0, history[0])
+    if minimized[-1] != history[-1]:
+        minimized.append(history[-1])
     return minimized
 
 
+def get_user_forecast_history(
+    question: Question, minimize: bool = False
+) -> list[ForecastSet]:
+    forecasts = question.user_forecasts.order_by("start_time").all()
+    timestamps = set()
+    for forecast in forecasts:
+        timestamps.add(forecast.start_time)
+        if forecast.end_time:
+            timestamps.add(forecast.end_time)
+
+    timestamps = sorted(timestamps)
+    if minimize:
+        timestamps = minimize_history(timestamps)
+    output = defaultdict(list)
+
+    for forecast in forecasts:
+        # Find active timestamps
+        forecast_timestamps = filter_between_dates(
+            timestamps, forecast.start_time, forecast.end_time
+        )
+
+        for timestamp in forecast_timestamps:
+            output[timestamp].append(forecast.get_prediction_values())
+
+    return [ForecastSet(output[key], key) for key in sorted(output.keys())]
+
+
 def generate_recency_weights(number_of_forecasts: int) -> np.ndarray:
     if number_of_forecasts <= 2:
         return None
@@ -301,7 +297,7 @@ def get_single_aggregation_history(
     full_summary: list[AggregateForecast] = []
 
     user_forecasts = question.user_forecasts.all()
-    user_forecast_history = get_user_forecast_history(user_forecasts)
+    user_forecast_history = get_user_forecast_history(user_forecasts, minimize=minimize)
     users = list(set(forecast.author for forecast in user_forecasts))
     reputations = get_reputations_during_interval(
         users, question.open_time, question.scheduled_close_time
@@ -333,6 +329,4 @@ def get_single_aggregation_history(
         full_summary[-1].end_time = new_entry.start_time
         full_summary.append(new_entry)
 
-    if minimize:
-        return minimize_forecast_history(full_summary)
     return full_summary
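
Same refactor as in community_prediction.py: the timestamp grid is minimized before the ForecastSets are built, instead of aggregating the full history and discarding most of it afterwards. A back-of-envelope illustration of the change in work per question (hypothetical numbers, just to show the scaling):

    # old: one aggregation per distinct start/end time, trimmed afterwards
    # new: at most ~128 aggregations, because the grid is trimmed beforehand
    n_forecasts = 5000
    n_timestamps = 2 * n_forecasts       # up to one start_time and one end_time each
    print(n_timestamps)                  # 10000 aggregation steps before
    print(min(n_timestamps, 128))        # ~128 aggregation steps after
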