Cp calc speedup by lsabor · Pull Request #272 · Metaculus/metaculus

Merged · 2 commits · Aug 28, 2024

12 changes: 7 additions & 5 deletions questions/management/commands/build_forecasts.py
@@ -13,7 +13,7 @@ class Command(BaseCommand):
     help = "Builds forecasts for all questions"
 
     def handle(self, *args, **options):
-        qs = Question.objects.all().prefetch_related("user_forecasts")
+        qs = Question.objects.all().order_by("id").prefetch_related("user_forecasts")
         total = qs.count()
         processed = 0
         tm = time.time()
@@ -31,11 +31,13 @@ def handle(self, *args, **options):
 
             processed += 1
             print(
-                f"Processed {int(processed / total * 100)}% ({processed}/{total})"
-                f" questions. Duration: {round(time.time() - tm)}s",
+                f"Processed {int(processed / total * 100)}% ({processed}/{total}) "
+                f"Dur:{round(time.time() - tm)}s "
+                f"Est:{round((time.time() - tm) / processed * (total - processed))}s",
                 end="\r",
            )
        print(
-            f"Processed {int(processed / total * 100)}% ({processed}/{total})"
-            f" questions. Duration: {round(time.time() - tm)}s",
+            f"Processed {int(processed / total * 100)}% ({processed}/{total}) "
+            f"Dur:{round(time.time() - tm)}s "
+            f"Est:{round((time.time() - tm) / processed * (total - processed))}s"
        )
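
The reworked progress line adds a simple linear remaining-time estimate next to the elapsed time: average seconds per question so far, scaled by the questions left. A minimal sketch of the arithmetic, using the same names as the command above:

    # e.g. 200 of 1000 questions done after 50s
    elapsed = time.time() - tm
    # linear extrapolation: 50 / 200 * (1000 - 200) = 200s remaining
    est_remaining = elapsed / processed * (total - processed)
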
90 changes: 46 additions & 44 deletions utils/the_math/community_prediction.py
@@ -154,55 +154,32 @@ def filter_between_dates(timestamps, start_time, end_time=None):
     return timestamps[start_index:end_index]
 
 
-def get_user_forecast_history(question: Question) -> list[ForecastSet]:
-    forecasts = question.user_forecasts.order_by("start_time").all()
-    timestamps = set()
-    for forecast in forecasts:
-        timestamps.add(forecast.start_time)
-        if forecast.end_time:
-            timestamps.add(forecast.end_time)
-
-    timestamps = sorted(timestamps)
-    output = defaultdict(list)
-
-    for forecast in forecasts:
-        # Find active timestamps
-        forecast_timestamps = filter_between_dates(
-            timestamps, forecast.start_time, forecast.end_time
-        )
-
-        for timestamp in forecast_timestamps:
-            output[timestamp].append(forecast.get_prediction_values())
-
-    return [ForecastSet(output[key], key) for key in sorted(output.keys())]
-
-
-def minimize_forecast_history(
-    forecast_history: list[AggregateForecast],
+def minimize_history(
+    history: list[datetime],
     max_size: int = 128,
-) -> list[AggregateForecast]:
-    if len(forecast_history) <= max_size:
-        return forecast_history
+) -> list[datetime]:
+    if len(history) <= max_size:
+        return history
 
-    # this is a pretty cheap algorithm that generates a minimized forecast history
-    # by taking the middle (wrt start_time) forecast of the list, then the middle
+    # this is a pretty cheap algorithm that generates a minimized history
+    # by taking the middle time of the list, then the middle
     # of the two halves, then the middle of the four quarters, etc. 7 times,
-    # generating a maximum list of 128 forecasts close evenly spaced.
+    # generating a maximum list of 128 datetimes close evenly spaced.
 
     def find_index_of_middle(forecasts: list[AggregateForecast]) -> int:
         if len(forecasts) < 3:
             return 0
-        t0 = forecasts[0].start_time
-        t2 = forecasts[-1].start_time
+        t0 = forecasts[0]
+        t2 = forecasts[-1]
         t1 = t0 + (t2 - t0) / 2
         for i, forecast in enumerate(forecasts):
-            if forecast.start_time > t1:
-                if forecast.start_time - t1 < t1 - forecasts[i - 1].start_time:
+            if forecast > t1:
+                if forecast - t1 < t1 - forecasts[i - 1]:
                     return i
                 return i - 1
 
     minimized = []
-    working_lists = [forecast_history]
+    working_lists = [history]
     for _ in range(int(np.ceil(np.log2(max_size)))):
         new_working_lists = []
         for working_list in working_lists:
@@ -215,16 +192,43 @@ def find_index_of_middle(forecasts: list[AggregateForecast]) -> int:
             new_working_lists.append(working_list[middle_index + 1 :])
         working_lists = new_working_lists
 
-    minimized: list[AggregateForecast] = sorted(minimized, key=lambda x: x.start_time)
+    minimized: list[AggregateForecast] = sorted(minimized)
     # make sure to always have the first and last forecast are the first
     # and last of the original list
-    if minimized[0].start_time != forecast_history[0].start_time:
-        minimized.insert(0, forecast_history[0])
-    if minimized[-1].start_time != forecast_history[-1].start_time:
-        minimized.append(forecast_history[-1])
+    if minimized[0] != history[0]:
+        minimized.insert(0, history[0])
+    if minimized[-1] != history[-1]:
+        minimized.append(history[-1])
     return minimized
 
 
+def get_user_forecast_history(
+    question: Question, minimize: bool = False
+) -> list[ForecastSet]:
+    forecasts = question.user_forecasts.order_by("start_time").all()
+    timestamps = set()
+    for forecast in forecasts:
+        timestamps.add(forecast.start_time)
+        if forecast.end_time:
+            timestamps.add(forecast.end_time)
+
+    timestamps = sorted(timestamps)
+    if minimize:
+        timestamps = minimize_history(timestamps)
+    output = defaultdict(list)
+
+    for forecast in forecasts:
+        # Find active timestamps
+        forecast_timestamps = filter_between_dates(
+            timestamps, forecast.start_time, forecast.end_time
+        )
+
+        for timestamp in forecast_timestamps:
+            output[timestamp].append(forecast.get_prediction_values())
+
+    return [ForecastSet(output[key], key) for key in sorted(output.keys())]
+
+
 def generate_recency_weights(number_of_forecasts: int) -> np.ndarray:
     if number_of_forecasts <= 2:
         return None
@@ -241,7 +245,7 @@ def get_cp_history(
 ) -> list[AggregateForecast]:
     full_summary: list[AggregateForecast] = []
 
-    forecast_history = get_user_forecast_history(question)
+    forecast_history = get_user_forecast_history(question, minimize=minimize)
     for i, forecast_set in enumerate(forecast_history):
         if aggregation_method == AggregationMethod.RECENCY_WEIGHTED:
             weights = generate_recency_weights(len(forecast_set.forecasts_values))
@@ -262,6 +266,4 @@ def get_cp_history(
         full_summary[-1].end_time = new_entry.start_time
         full_summary.append(new_entry)
 
-    if minimize:
-        return minimize_forecast_history(full_summary)
     return full_summary
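
This is the heart of the speedup: minimize_history now thins the timestamp grid before any aggregation happens, so get_cp_history builds and aggregates at most ~128 ForecastSets instead of one per distinct forecast boundary; the old code aggregated everything and only then trimmed the result with minimize_forecast_history. A self-contained sketch of the downsampler follows. The loop body that the diff collapses between the two hunks above is reconstructed here from the visible lines and comments, so treat it as illustrative rather than verbatim:

    from datetime import datetime, timedelta

    import numpy as np


    def minimize_history(history: list[datetime], max_size: int = 128) -> list[datetime]:
        if len(history) <= max_size:
            return history

        def find_index_of_middle(times: list[datetime]) -> int:
            # index of the entry closest to the temporal midpoint
            if len(times) < 3:
                return 0
            t1 = times[0] + (times[-1] - times[0]) / 2
            for i, t in enumerate(times):
                if t > t1:
                    return i if t - t1 < t1 - times[i - 1] else i - 1

        minimized: list[datetime] = []
        working_lists = [history]
        # ceil(log2(128)) = 7 rounds: pick each list's temporal midpoint,
        # then recurse into the two halves on either side of it
        for _ in range(int(np.ceil(np.log2(max_size)))):
            new_working_lists = []
            for working_list in working_lists:
                if not working_list:
                    continue
                middle_index = find_index_of_middle(working_list)
                minimized.append(working_list[middle_index])
                new_working_lists.append(working_list[:middle_index])
                new_working_lists.append(working_list[middle_index + 1 :])
            working_lists = new_working_lists

        minimized = sorted(minimized)
        # always keep the endpoints of the original history
        if minimized[0] != history[0]:
            minimized.insert(0, history[0])
        if minimized[-1] != history[-1]:
            minimized.append(history[-1])
        return minimized


    # 1000 hourly timestamps collapse to 129 roughly evenly spaced ones
    # (127 midpoints plus the two original endpoints)
    ts = [datetime(2024, 1, 1) + timedelta(hours=i) for i in range(1000)]
    print(len(minimize_history(ts)))
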
100 changes: 47 additions & 53 deletions utils/the_math/single_aggregation.py
@@ -206,63 +206,32 @@ def filter_between_dates(timestamps, start_time, end_time=None):
     return timestamps[start_index:end_index]
 
 
-def get_user_forecast_history(
-    forecasts: QuerySet[Forecast],
-) -> list[ForecastSet]:
-    timestamps = set()
-    for forecast in forecasts:
-        timestamps.add(forecast.start_time)
-        if forecast.end_time:
-            timestamps.add(forecast.end_time)
-
-    timestamps = sorted(timestamps)
-    prediction_values = defaultdict(list)
-    users = defaultdict(list)
-    timesteps = defaultdict(list)
-
-    for forecast in forecasts:
-        # Find active timestamps
-        forecast_timestamps = filter_between_dates(
-            timestamps, forecast.start_time, forecast.end_time
-        )
-
-        for timestamp in forecast_timestamps:
-            prediction_values[timestamp].append((forecast.get_prediction_values()))
-            users[timestamp].append(forecast.author)
-            timesteps[timestamp].append(forecast.start_time)
-
-    return [
-        ForecastSet(prediction_values[key], key, users[key], timesteps[key])
-        for key in sorted(prediction_values.keys())
-    ]
-
-
-def minimize_forecast_history(
-    forecast_history: list[AggregateForecast],
+def minimize_history(
+    history: list[datetime],
     max_size: int = 128,
-) -> list[AggregateForecast]:
-    if len(forecast_history) <= max_size:
-        return forecast_history
+) -> list[datetime]:
+    if len(history) <= max_size:
+        return history
 
-    # this is a pretty cheap algorithm that generates a minimized forecast history
-    # by taking the middle (wrt start_time) forecast of the list, then the middle
+    # this is a pretty cheap algorithm that generates a minimized history
+    # by taking the middle time of the list, then the middle
     # of the two halves, then the middle of the four quarters, etc. 7 times,
-    # generating a maximum list of 128 forecasts close evenly spaced.
+    # generating a maximum list of 128 datetimes close evenly spaced.
 
     def find_index_of_middle(forecasts: list[AggregateForecast]) -> int:
         if len(forecasts) < 3:
             return 0
-        t0 = forecasts[0].start_time
-        t2 = forecasts[-1].start_time
+        t0 = forecasts[0]
+        t2 = forecasts[-1]
         t1 = t0 + (t2 - t0) / 2
         for i, forecast in enumerate(forecasts):
-            if forecast.start_time > t1:
-                if forecast.start_time - t1 < t1 - forecasts[i - 1].start_time:
+            if forecast > t1:
+                if forecast - t1 < t1 - forecasts[i - 1]:
                     return i
                 return i - 1
 
-    minimized: list[AggregateForecast] = []
-    working_lists = [forecast_history]
+    minimized = []
+    working_lists = [history]
     for _ in range(int(np.ceil(np.log2(max_size)))):
         new_working_lists = []
         for working_list in working_lists:
@@ -275,16 +244,43 @@ def find_index_of_middle(forecasts: list[AggregateForecast]) -> int:
             new_working_lists.append(working_list[middle_index + 1 :])
         working_lists = new_working_lists
 
-    minimized = sorted(minimized, key=lambda x: x.start_time)
+    minimized: list[AggregateForecast] = sorted(minimized)
     # make sure to always have the first and last forecast are the first
     # and last of the original list
-    if minimized[0].start_time != forecast_history[0].start_time:
-        minimized.insert(0, forecast_history[0])
-    if minimized[-1].start_time != forecast_history[-1].start_time:
-        minimized.append(forecast_history[-1])
+    if minimized[0] != history[0]:
+        minimized.insert(0, history[0])
+    if minimized[-1] != history[-1]:
+        minimized.append(history[-1])
     return minimized
 
 
+def get_user_forecast_history(
+    question: Question, minimize: bool = False
+) -> list[ForecastSet]:
+    forecasts = question.user_forecasts.order_by("start_time").all()
+    timestamps = set()
+    for forecast in forecasts:
+        timestamps.add(forecast.start_time)
+        if forecast.end_time:
+            timestamps.add(forecast.end_time)
+
+    timestamps = sorted(timestamps)
+    if minimize:
+        timestamps = minimize_history(timestamps)
+    output = defaultdict(list)
+
+    for forecast in forecasts:
+        # Find active timestamps
+        forecast_timestamps = filter_between_dates(
+            timestamps, forecast.start_time, forecast.end_time
+        )
+
+        for timestamp in forecast_timestamps:
+            output[timestamp].append(forecast.get_prediction_values())
+
+    return [ForecastSet(output[key], key) for key in sorted(output.keys())]
+
+
 def generate_recency_weights(number_of_forecasts: int) -> np.ndarray:
     if number_of_forecasts <= 2:
         return None
@@ -301,7 +297,7 @@ def get_single_aggregation_history(
     full_summary: list[AggregateForecast] = []
 
     user_forecasts = question.user_forecasts.all()
-    user_forecast_history = get_user_forecast_history(user_forecasts)
+    user_forecast_history = get_user_forecast_history(user_forecasts, minimize=minimize)
     users = list(set(forecast.author for forecast in user_forecasts))
     reputations = get_reputations_during_interval(
         users, question.open_time, question.scheduled_close_time
@@ -333,6 +329,4 @@ def get_single_aggregation_history(
         full_summary[-1].end_time = new_entry.start_time
         full_summary.append(new_entry)
 
-    if minimize:
-        return minimize_forecast_history(full_summary)
     return full_summary
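
Same refactor as in community_prediction.py: the timestamp grid is minimized before the ForecastSets are built, instead of aggregating the full history and discarding most of it afterwards. A back-of-envelope illustration of the change in work per question (hypothetical numbers, just to show the scaling):

    # old: one aggregation per distinct start/end time, trimmed afterwards
    # new: at most ~128 aggregations, because the grid is trimmed beforehand
    n_forecasts = 5000
    n_timestamps = 2 * n_forecasts       # up to one start_time and one end_time each
    print(n_timestamps)                  # 10000 aggregation steps before
    print(min(n_timestamps, 128))        # ~128 aggregation steps after
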