From 4af3e94439b5480b6c907ea5c2b5c43c185322d8 Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Wed, 17 Dec 2025 04:51:46 +0000 Subject: [PATCH 1/7] renamed fall bot files and made spring bot file --- forecasting_tools/__init__.py | 12 +- forecasting_tools/forecast_bots/bot_lists.py | 12 +- .../official_bots/gpt_4_1_optimized_bot.py | 2 +- ..._bot.py => research_only_bot_2025_fall.py} | 2 +- ...plate_bot.py => template_bot_2025_fall.py} | 0 .../official_bots/template_bot_2026_spring.py | 776 ++++++++++++++++++ .../forecast_bots/template_bot.py | 4 +- run_bots.py | 6 +- 8 files changed, 795 insertions(+), 19 deletions(-) rename forecasting_tools/forecast_bots/official_bots/{fall_research_only_bot.py => research_only_bot_2025_fall.py} (99%) rename forecasting_tools/forecast_bots/official_bots/{fall_template_bot.py => template_bot_2025_fall.py} (100%) create mode 100644 forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py diff --git a/forecasting_tools/__init__.py b/forecasting_tools/__init__.py index 92b7393e..c92d905d 100644 --- a/forecasting_tools/__init__.py +++ b/forecasting_tools/__init__.py @@ -134,12 +134,6 @@ from forecasting_tools.forecast_bots.forecast_bot import ForecastBot as ForecastBot from forecasting_tools.forecast_bots.forecast_bot import Notepad as Notepad from forecasting_tools.forecast_bots.main_bot import MainBot as MainBot -from forecasting_tools.forecast_bots.official_bots.fall_research_only_bot import ( - FallResearchOnlyBot2025 as FallResearchOnlyBot2025, -) -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( - FallTemplateBot2025 as FallTemplateBot2025, -) from forecasting_tools.forecast_bots.official_bots.gpt_4_1_optimized_bot import ( GPT41OptimizedBot as GPT41OptimizedBot, ) @@ -155,6 +149,12 @@ from forecasting_tools.forecast_bots.official_bots.q4_template_bot import ( Q4TemplateBot2024 as Q4TemplateBot2024, ) +from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import ( + FallResearchOnlyBot2025 as FallResearchOnlyBot2025, +) +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( + FallTemplateBot2025 as FallTemplateBot2025, +) from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot as UniformProbabilityBot, ) diff --git a/forecasting_tools/forecast_bots/bot_lists.py b/forecasting_tools/forecast_bots/bot_lists.py index 87141c0e..813ad9f5 100644 --- a/forecasting_tools/forecast_bots/bot_lists.py +++ b/forecasting_tools/forecast_bots/bot_lists.py @@ -8,12 +8,6 @@ from forecasting_tools.forecast_bots.experiments.q4_veritas_bot import Q4VeritasBot from forecasting_tools.forecast_bots.forecast_bot import ForecastBot from forecasting_tools.forecast_bots.main_bot import MainBot -from forecasting_tools.forecast_bots.official_bots.fall_research_only_bot import ( - FallResearchOnlyBot2025, -) -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( - FallTemplateBot2025, -) from forecasting_tools.forecast_bots.official_bots.gpt_4_1_optimized_bot import ( GPT41OptimizedBot, ) @@ -29,6 +23,12 @@ from forecasting_tools.forecast_bots.official_bots.q4_template_bot import ( Q4TemplateBot2024, ) +from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import ( + FallResearchOnlyBot2025, +) +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( + FallTemplateBot2025, +) from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( 
UniformProbabilityBot, ) diff --git a/forecasting_tools/forecast_bots/official_bots/gpt_4_1_optimized_bot.py b/forecasting_tools/forecast_bots/official_bots/gpt_4_1_optimized_bot.py index 27ad3184..4ef4b31c 100644 --- a/forecasting_tools/forecast_bots/official_bots/gpt_4_1_optimized_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/gpt_4_1_optimized_bot.py @@ -4,7 +4,7 @@ from forecasting_tools.data_models.forecast_report import ReasonedPrediction from forecasting_tools.data_models.questions import BinaryQuestion, MetaculusQuestion -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( FallTemplateBot2025, ) from forecasting_tools.helpers.asknews_searcher import AskNewsSearcher diff --git a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py b/forecasting_tools/forecast_bots/official_bots/research_only_bot_2025_fall.py similarity index 99% rename from forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py rename to forecasting_tools/forecast_bots/official_bots/research_only_bot_2025_fall.py index 0ce2dadd..f3893a28 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/research_only_bot_2025_fall.py @@ -13,7 +13,7 @@ MultipleChoiceQuestion, NumericQuestion, ) -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( FallTemplateBot2025, ) from forecasting_tools.util.misc import clean_indents diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2025_fall.py similarity index 100% rename from forecasting_tools/forecast_bots/official_bots/fall_template_bot.py rename to forecasting_tools/forecast_bots/official_bots/template_bot_2025_fall.py diff --git a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py new file mode 100644 index 00000000..97a98c85 --- /dev/null +++ b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py @@ -0,0 +1,776 @@ +import argparse +import asyncio +import logging +from datetime import datetime, timezone +from typing import Literal + +from forecasting_tools.agents_and_tools.research.smart_searcher import SmartSearcher +from forecasting_tools.ai_models.general_llm import GeneralLlm +from forecasting_tools.data_models.binary_report import BinaryPrediction +from forecasting_tools.data_models.conditional_models import ( + ConditionalPrediction, + PredictionAffirmed, +) +from forecasting_tools.data_models.data_organizer import PredictionTypes +from forecasting_tools.data_models.forecast_report import ReasonedPrediction +from forecasting_tools.data_models.multiple_choice_report import PredictedOptionList +from forecasting_tools.data_models.numeric_report import ( + DatePercentile, + NumericDistribution, + Percentile, +) +from forecasting_tools.data_models.questions import ( + BinaryQuestion, + ConditionalQuestion, + DateQuestion, + MetaculusQuestion, + MultipleChoiceQuestion, + NumericQuestion, +) +from forecasting_tools.forecast_bots.forecast_bot import ForecastBot +from forecasting_tools.helpers.asknews_searcher import AskNewsSearcher +from forecasting_tools.helpers.metaculus_client import MetaculusClient +from 
forecasting_tools.helpers.prediction_extractor import PredictionExtractor +from forecasting_tools.helpers.structure_output import structure_output +from forecasting_tools.util.misc import clean_indents + + logger = logging.getLogger(__name__) + + +class SpringTemplateBot2026(ForecastBot): + """ + This is a copy of the template bot for Spring 2026 Metaculus AI Tournament. + This bot is what is used by Metaculus in our benchmark, but is also provided as a template for new bot makers. + This template is given as-is, and though we have covered most test cases + in forecasting-tools it may be worth double checking key components locally. + + Main changes since Q2: + - An LLM now parses the final forecast output (rather than programmatic parsing) + - Added resolution criteria and fine print explicitly to the research prompt + - Previously in the prompt, nothing about upper/lower bound was shown when the bounds were open. Now a suggestion is made when this is the case. + - Support for nominal bounds was added (i.e. when there are discrete questions and normal upper/lower bounds are not as intuitive) + + The main entry point of this bot is `forecast_on_tournament` in the parent class. + See the script at the bottom of the file for more details on how to run the bot. + Ignoring the finer details, the general flow is: + - Load questions from Metaculus + - For each question + - Execute run_research a number of times equal to research_reports_per_question + - Execute respective run_forecast function `predictions_per_research_report * research_reports_per_question` times + - Aggregate the predictions + - Submit prediction (if publish_reports_to_metaculus is True) + - Return a list of ForecastReport objects + + Only the research and forecast functions need to be implemented in ForecastBot subclasses, + though you may want to override other ones. + In this example, you can change the prompts to be whatever you want, since + structure_output uses an LLMto intelligently reformat the output into the needed structure. + + By default (i.e. 'tournament' mode), when you run this script, it will forecast on any open questions for the + MiniBench and Seasonal AIB tournaments. If you want to forecast on only one or the other, you can remove one + of them from the 'tournament' mode code at the bottom of the file. + + You can experiment with what models work best with your bot by using the `llms` parameter when initializing the bot. + You can initialize the bot with any number of models. For example, + ```python + my_bot = MyBot( + ... + llms={ # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you + "default": GeneralLlm( + model="openrouter/openai/gpt-4o", # "anthropic/claude-3-5-sonnet-20241022", etc (see docs for litellm) + temperature=0.3, + timeout=40, + allowed_tries=2, + ), + "summarizer": "openai/gpt-4o-mini", + "researcher": "asknews/deep-research/low", + "parser": "openai/gpt-4o-mini", + }, + ) + ``` + + Then you can access the model in custom functions like this: + ```python + research_strategy = self.get_llm("researcher", "model_name") + if research_strategy == "asknews/deep-research/low": + ...
+ # OR + summarizer = await self.get_llm("summarizer", "model_name").invoke(prompt) + # OR + reasoning = await self.get_llm("default", "llm").invoke(prompt) + ``` + + If you end up having trouble with rate limits and want to try a more sophisticated rate limiter try: + ```python + from forecasting_tools import RefreshingBucketRateLimiter + rate_limiter = RefreshingBucketRateLimiter( + capacity=2, + refresh_rate=1, + ) # Allows 1 request per second on average with a burst of 2 requests initially. Set this as a class variable + await self.rate_limiter.wait_till_able_to_acquire_resources(1) # 1 because it's consuming 1 request (use more if you are adding a token limit) + ``` + Additionally OpenRouter has large rate limits immediately on account creation + """ + + _max_concurrent_questions = ( + 1 # Set this to whatever works for your search-provider/ai-model rate limits + ) + _concurrency_limiter = asyncio.Semaphore(_max_concurrent_questions) + _structure_output_validation_samples = 2 + + async def run_research(self, question: MetaculusQuestion) -> str: + async with self._concurrency_limiter: + research = "" + researcher = self.get_llm("researcher") + + prompt = clean_indents( + f""" + You are an assistant to a superforecaster. + The superforecaster will give you a question they intend to forecast on. + To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information. + You do not produce forecasts yourself. + + Question: + {question.question_text} + + This question's outcome will be determined by the specific criteria below: + {question.resolution_criteria} + + {question.fine_print} + """ + ) + + if isinstance(researcher, GeneralLlm): + research = await researcher.invoke(prompt) + elif ( + researcher == "asknews/news-summaries" + or researcher == "asknews/deep-research/medium-depth" + or researcher == "asknews/deep-research/high-depth" + ): + research = await AskNewsSearcher().call_preconfigured_version( + researcher, prompt + ) + elif researcher.startswith("smart-searcher"): + model_name = researcher.removeprefix("smart-searcher/") + searcher = SmartSearcher( + model=model_name, + temperature=0, + num_searches_to_run=2, + num_sites_per_search=10, + use_advanced_filters=False, + ) + research = await searcher.invoke(prompt) + elif not researcher or researcher == "None" or researcher == "no_research": + research = "" + else: + research = await self.get_llm("researcher", "llm").invoke(prompt) + logger.info(f"Found Research for URL {question.page_url}:\n{research}") + return research + + def _add_reasoning_to_research( + self, + research: str, + reasoning: ReasonedPrediction[PredictionTypes], + question_type: str, + ) -> str: + from forecasting_tools.data_models.data_organizer import DataOrganizer + + question_type = question_type.title() + return clean_indents( + f""" + {research} + --- + ## {question_type} Question Information + You have previously forecasted the {question_type} Question to the value: {DataOrganizer.get_readable_prediction(reasoning.prediction_value)} + This is relevant information for your current forecast, but it is NOT your current forecast, but previous forecasting information that is relevant to your current forecast. + The reasoning for the {question_type} Question was as such: + ``` + {reasoning.reasoning} + ``` + This is absolutely essential: do NOT use this reasoning to re-forecast the {question_type} question. 
+ """ + ) + + async def _get_question_prediction_info( + self, question: MetaculusQuestion, research: str, question_type: str + ) -> tuple[ReasonedPrediction[PredictionTypes], str]: + from forecasting_tools.data_models.data_organizer import DataOrganizer + + previous_forecasts = question.previous_forecasts + if ( + question_type in ["parent", "child"] + and previous_forecasts + and question_type not in self.force_reforecast_in_conditional + ): + # TODO: add option to not affirm current parent/child forecasts, create new forecast + previous_forecast = previous_forecasts[-1] + current_utc_time = datetime.now(timezone.utc) + if ( + previous_forecast.timestamp_end is None + or previous_forecast.timestamp_end > current_utc_time + ): + pretty_value = DataOrganizer.get_readable_prediction(previous_forecast) + return ( + ReasonedPrediction( + prediction_value=PredictionAffirmed(), + reasoning=f"Already existing forecast reaffirmed at {pretty_value}.", + ), + research, + ) + info = await self._make_prediction(question, research) + full_research = self._add_reasoning_to_research(research, info, question_type) + return info, full_research + + async def _run_forecast_on_conditional( + self, question: ConditionalQuestion, research: str + ) -> ReasonedPrediction[ConditionalPrediction]: + parent_info, full_research = await self._get_question_prediction_info( + question.parent, research, "parent" + ) + child_info, full_research = await self._get_question_prediction_info( + question.child, research, "child" + ) + yes_info, full_research = await self._get_question_prediction_info( + question.question_yes, full_research, "yes" + ) + no_info, full_research = await self._get_question_prediction_info( + question.question_no, full_research, "no" + ) + full_reasoning = clean_indents( + f""" + ## Parent Question Reasoning + {parent_info.reasoning} + ## Child Question Reasoning + {child_info.reasoning} + ## Yes Question Reasoning + {yes_info.reasoning} + ## No Question Reasoning + {no_info.reasoning} + """ + ) + full_prediction = ConditionalPrediction( + parent=parent_info.prediction_value, + child=child_info.prediction_value, + prediction_yes=yes_info.prediction_value, + prediction_no=no_info.prediction_value, + ) + return ReasonedPrediction( + reasoning=full_reasoning, prediction_value=full_prediction + ) + + async def _run_forecast_on_binary( + self, question: BinaryQuestion, research: str + ) -> ReasonedPrediction[float]: + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + Question background: + {question.background_info} + + + This question's outcome will be determined by the specific criteria below. These criteria have not yet been satisfied: + {question.resolution_criteria} + + {question.fine_print} + + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The status quo outcome if nothing changed. + (c) A brief description of a scenario that results in a No outcome. + (d) A brief description of a scenario that results in a Yes outcome. + + You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time. 
+ {self._get_conditional_disclaimer_if_necessary(question)} + The last thing you write is your final answer as: "Probability: ZZ%", 0-100 + """ + ) + + return await self._binary_prompt_to_forecast(question, prompt) + + async def _binary_prompt_to_forecast( + self, + question: BinaryQuestion, + prompt: str, + double_check_extraction: bool = False, + ) -> ReasonedPrediction[float]: + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + binary_prediction: BinaryPrediction = await structure_output( + reasoning, + BinaryPrediction, + model=self.get_llm("parser", "llm"), + num_validation_samples=self._structure_output_validation_samples, + ) + if double_check_extraction: + redundant_extraction = PredictionExtractor.extract_last_percentage_value( + reasoning + ) + assert ( + abs(redundant_extraction - binary_prediction.prediction_in_decimal) + < 0.001 + ), f"Redundant extraction {redundant_extraction} does not match binary prediction {binary_prediction.prediction_in_decimal}" + decimal_pred = max(0.01, min(0.99, binary_prediction.prediction_in_decimal)) + + logger.info( + f"Forecasted URL {question.page_url} with prediction: {decimal_pred}." + ) + return ReasonedPrediction(prediction_value=decimal_pred, reasoning=reasoning) + + async def _run_forecast_on_multiple_choice( + self, question: MultipleChoiceQuestion, research: str + ) -> ReasonedPrediction[PredictedOptionList]: + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + The options are: {question.options} + + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The status quo outcome if nothing changed. + (c) A description of a scenario that results in an unexpected outcome. + + {self._get_conditional_disclaimer_if_necessary(question)} + You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes. + + The last thing you write is your final probabilities for the N options in this order {question.options} as: + Option_A: Probability_A + Option_B: Probability_B + ... + Option_N: Probability_N + """ + ) + return await self._multiple_choice_prompt_to_forecast(question, prompt) + + async def _multiple_choice_prompt_to_forecast( + self, + question: MultipleChoiceQuestion, + prompt: str, + double_check_extraction: bool = False, + ) -> ReasonedPrediction[PredictedOptionList]: + parsing_instructions = clean_indents( + f""" + Make sure that all option names are one of the following: + {question.options} + + The text you are parsing may prepend these options with some variation of "Option" which you should remove if not part of the option names I just gave you. + Additionally, you may sometimes need to parse a 0% probability. Please do not skip options with 0% but rather make it an entry in your final list with 0% probability.
+ """ + ) + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + predicted_option_list: PredictedOptionList = await structure_output( + text_to_structure=reasoning, + output_type=PredictedOptionList, + model=self.get_llm("parser", "llm"), + num_validation_samples=self._structure_output_validation_samples, + additional_instructions=parsing_instructions, + ) + if double_check_extraction: + redundant_extraction = ( + PredictionExtractor.extract_option_list_with_percentage_afterwards( + reasoning, question.options + ) + ) + for redundant_prediction in redundant_extraction.predicted_options: + matching_original_option = next( + ( + option + for option in predicted_option_list.predicted_options + if option.option_name == redundant_prediction.option_name + ), + None, + ) + assert ( + matching_original_option is not None + ), f"Matching original option not found for {redundant_prediction.option_name}" + assert ( + abs( + redundant_prediction.probability + - matching_original_option.probability + ) + < 0.001 + ), f"Redundant extraction {redundant_prediction.probability} does not match original option {matching_original_option.probability} for option {redundant_prediction.option_name}" + + logger.info( + f"Forecasted URL {question.page_url} with prediction: {predicted_option_list}." + ) + return ReasonedPrediction( + prediction_value=predicted_option_list, reasoning=reasoning + ) + + async def _run_forecast_on_numeric( + self, question: NumericQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + Units for answer: {question.unit_of_measure if question.unit_of_measure else "Not stated (please infer this)"} + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + {lower_bound_message} + {upper_bound_message} + + Formatting Instructions: + - Please notice the units requested and give your answer in these units (e.g. whether you represent a number as 1,000,000 or 1 million). + - Never use scientific notation. + - Always start with a smaller number (more negative if negative) and then increase from there + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The outcome if nothing changed. + (c) The outcome if the current trend continued. + (d) The expectations of experts and markets. + (e) A brief description of an unexpected scenario that results in a low outcome. + (f) A brief description of an unexpected scenario that results in a high outcome. + + {self._get_conditional_disclaimer_if_necessary(question)} + You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. 
+ + The last thing you write is your final answer as: + " + Percentile 10: XX + Percentile 20: XX + Percentile 40: XX + Percentile 60: XX + Percentile 80: XX + Percentile 90: XX + " + """ + ) + return await self._numeric_prompt_to_forecast(question, prompt) + + async def _numeric_prompt_to_forecast( + self, + question: NumericQuestion, + prompt: str, + double_check_extraction: bool = False, + ) -> ReasonedPrediction[NumericDistribution]: + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + parsing_instructions = clean_indents( + f""" + The text given to you is trying to give a forecast distribution for a numeric question. + - This text is trying to answer the numeric question: "{question.question_text}". + - When parsing the text, please make sure to give the values (the ones assigned to percentiles) in terms of the correct units. + - The units for the forecast are: {question.unit_of_measure} + - Your work will be shown publicly with these units stated verbatim after the numbers you parse. + - As an example, someone else guessed that the answer will be between {question.lower_bound} {question.unit_of_measure} and {question.upper_bound} {question.unit_of_measure}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". + - If the text doesn't give the answer in the correct units, you should parse it into the right units. For instance if the answer gives numbers as $500,000,000 and units are "B $" then you should parse the answer as 0.5 (since $500,000,000 is $0.5 billion). + - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. + - Turn any values that are in scientific notation into regular numbers. + """ + ) + percentile_list: list[Percentile] = await structure_output( + reasoning, + list[Percentile], + model=self.get_llm("parser", "llm"), + additional_instructions=parsing_instructions, + num_validation_samples=self._structure_output_validation_samples, + ) + + if double_check_extraction: + redundant_extraction = PredictionExtractor.extract_numeric_distribution_from_list_of_percentile_number_and_probability( + reasoning, question + ) + for redundant_percentile in redundant_extraction.declared_percentiles: + matching_original_percentile = next( + ( + percentile + for percentile in percentile_list + if abs(percentile.percentile - redundant_percentile.percentile) + < 0.001 + ), + None, + ) + assert ( + matching_original_percentile is not None + ), f"Matching original percentile not found for {redundant_percentile.percentile}" + assert ( + abs(redundant_percentile.value - matching_original_percentile.value) + < 0.001 + ), f"Redundant extraction {redundant_percentile.value} does not match original percentile {matching_original_percentile.value} for percentile {redundant_percentile.percentile}" + prediction = NumericDistribution.from_question(percentile_list, question) + logger.info( + f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}."
+ ) + return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) + + async def _run_forecast_on_date( + self, question: DateQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + {lower_bound_message} + {upper_bound_message} + + Formatting Instructions: + - This is a date question, and as such, the answer must be expressed in terms of dates. + - The dates must be written in the format of YYYY-MM-DD. If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed. + - Always start with an earlier date chronologically and then increase from there. + - Do NOT forget this. The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The outcome if nothing changed. + (c) The outcome if the current trend continued. + (d) The expectations of experts and markets. + (e) A brief description of an unexpected scenario that results in a low outcome. + (f) A brief description of an unexpected scenario that results in a high outcome. + + {self._get_conditional_disclaimer_if_necessary(question)} + You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. + + The last thing you write is your final answer as: + " + Percentile 10: YYYY-MM-DD (oldest date) + Percentile 20: YYYY-MM-DD + Percentile 40: YYYY-MM-DD + Percentile 60: YYYY-MM-DD + Percentile 80: YYYY-MM-DD + Percentile 90: YYYY-MM-DD (newest date) + " + """ + ) + forecast = await self._date_prompt_to_forecast(question, prompt) + return forecast + + async def _date_prompt_to_forecast( + self, + question: DateQuestion, + prompt: str, + double_check_extraction: bool = False, + ) -> ReasonedPrediction[NumericDistribution]: + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + parsing_instructions = clean_indents( + f""" + The text given to you is trying to give a forecast distribution for a date question. + - This text is trying to answer the question: "{question.question_text}". + - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". + - The output is given as dates/times; please format it into a valid datetime-parsable string. Assume midnight UTC if no hour is given. + - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text.
+ """ + ) + date_percentile_list: list[DatePercentile] = await structure_output( + reasoning, + list[DatePercentile], + model=self.get_llm("parser", "llm"), + additional_instructions=parsing_instructions, + num_validation_samples=self._structure_output_validation_samples, + ) + + percentile_list = [ + Percentile( + percentile=percentile.percentile, + value=percentile.value.timestamp(), + ) + for percentile in date_percentile_list + ] + + if double_check_extraction: + raise ValueError("Double check extraction not supported for date questions") + + prediction = NumericDistribution.from_question(percentile_list, question) + logger.info( + f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." + ) + return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) + + def _create_upper_and_lower_bound_messages( + self, question: NumericQuestion | DateQuestion + ) -> tuple[str, str]: + if isinstance(question, NumericQuestion): + if question.nominal_upper_bound is not None: + upper_bound_number = question.nominal_upper_bound + else: + upper_bound_number = question.upper_bound + if question.nominal_lower_bound is not None: + lower_bound_number = question.nominal_lower_bound + else: + lower_bound_number = question.lower_bound + unit_of_measure = question.unit_of_measure + elif isinstance(question, DateQuestion): + upper_bound_number = question.upper_bound.date().isoformat() + lower_bound_number = question.lower_bound.date().isoformat() + unit_of_measure = "" + else: + raise ValueError() + + if question.open_upper_bound: + upper_bound_message = f"The question creator thinks the number is likely not higher than {upper_bound_number} {unit_of_measure}." + else: + upper_bound_message = f"The outcome can not be higher than {upper_bound_number} {unit_of_measure}." + + if question.open_lower_bound: + lower_bound_message = f"The question creator thinks the number is likely not lower than {lower_bound_number} {unit_of_measure}." + else: + lower_bound_message = f"The outcome can not be lower than {lower_bound_number} {unit_of_measure}." + return upper_bound_message, lower_bound_message + + def _get_conditional_disclaimer_if_necessary( + self, question: MetaculusQuestion + ) -> str: + if question.conditional_type not in ["yes", "no"]: + return "" + return clean_indents( + """ + As you are given a conditional question with a parent and child, you are to only forecast the **CHILD** question, given the parent question's resolution. + You never re-forecast the parent question under any circumstances, but you use probabilistic reasoning, strongly considering the parent question's resolution, to forecast the child question. 
+ """ + ) + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + # Suppress LiteLLM logging + litellm_logger = logging.getLogger("LiteLLM") + litellm_logger.setLevel(logging.WARNING) + litellm_logger.propagate = False + + parser = argparse.ArgumentParser( + description="Run the Q1TemplateBot forecasting system" + ) + parser.add_argument( + "--mode", + type=str, + choices=["tournament", "metaculus_cup", "test_questions"], + default="tournament", + help="Specify the run mode (default: tournament)", + ) + args = parser.parse_args() + run_mode: Literal["tournament", "metaculus_cup", "test_questions"] = args.mode + assert run_mode in [ + "tournament", + "metaculus_cup", + "test_questions", + ], "Invalid run mode" + + template_bot = SpringTemplateBot2026( + research_reports_per_question=1, + predictions_per_research_report=5, + use_research_summary_to_forecast=False, + publish_reports_to_metaculus=True, + folder_to_save_reports_to=None, + skip_previously_forecasted_questions=True, + # llms={ # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you + # "default": GeneralLlm( + # model="openrouter/openai/gpt-4o", # "anthropic/claude-3-5-sonnet-20241022", etc (see docs for litellm) + # temperature=0.3, + # timeout=40, + # allowed_tries=2, + # ), + # "summarizer": "openai/gpt-4o-mini", + # "researcher": "asknews/deep-research/low", + # "parser": "openai/gpt-4o-mini", + # }, + ) + + client = MetaculusClient() + if run_mode == "tournament": + seasonal_tournament_reports = asyncio.run( + template_bot.forecast_on_tournament( + client.CURRENT_AI_COMPETITION_ID, return_exceptions=True + ) + ) + minibench_reports = asyncio.run( + template_bot.forecast_on_tournament( + client.CURRENT_MINIBENCH_ID, return_exceptions=True + ) + ) + forecast_reports = seasonal_tournament_reports + minibench_reports + elif run_mode == "metaculus_cup": + # The Metaculus cup is a good way to test the bot's performance on regularly open questions. You can also use AXC_2025_TOURNAMENT_ID = 32564 or AI_2027_TOURNAMENT_ID = "ai-2027" + # The Metaculus cup may not be initialized near the beginning of a season (i.e. 
January, May, September) + template_bot.skip_previously_forecasted_questions = False + forecast_reports = asyncio.run( + template_bot.forecast_on_tournament( + client.CURRENT_METACULUS_CUP_ID, return_exceptions=True + ) + ) + elif run_mode == "test_questions": + # Example questions are a good way to test the bot's performance on a single question + EXAMPLE_QUESTIONS = [ + "https://www.metaculus.com/questions/578/human-extinction-by-2100/", # Human Extinction - Binary + "https://www.metaculus.com/questions/14333/age-of-oldest-human-as-of-2100/", # Age of Oldest Human - Numeric + "https://www.metaculus.com/questions/22427/number-of-new-leading-ai-labs/", # Number of New Leading AI Labs - Multiple Choice + "https://www.metaculus.com/c/diffusion-community/38880/how-many-us-labor-strikes-due-to-ai-in-2029/", # Number of US Labor Strikes Due to AI in 2029 - Discrete + ] + template_bot.skip_previously_forecasted_questions = False + questions = [ + client.get_question_by_url(question_url) + for question_url in EXAMPLE_QUESTIONS + ] + forecast_reports = asyncio.run( + template_bot.forecast_questions(questions, return_exceptions=True) + ) + template_bot.log_report_summary(forecast_reports) diff --git a/forecasting_tools/forecast_bots/template_bot.py b/forecasting_tools/forecast_bots/template_bot.py index 02d13f89..b5e7393b 100644 --- a/forecasting_tools/forecast_bots/template_bot.py +++ b/forecasting_tools/forecast_bots/template_bot.py @@ -1,7 +1,7 @@ -from forecasting_tools.forecast_bots.official_bots.fall_research_only_bot import ( +from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import ( FallResearchOnlyBot2025, ) -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( FallTemplateBot2025, ) diff --git a/run_bots.py b/run_bots.py index 522d10e8..b3fb197e 100644 --- a/run_bots.py +++ b/run_bots.py @@ -20,12 +20,12 @@ from forecasting_tools.data_models.forecast_report import ForecastReport from forecasting_tools.data_models.questions import DateQuestion, MetaculusQuestion from forecasting_tools.forecast_bots.forecast_bot import ForecastBot -from forecasting_tools.forecast_bots.official_bots.fall_research_only_bot import ( - FallResearchOnlyBot2025, -) from forecasting_tools.forecast_bots.official_bots.gpt_4_1_optimized_bot import ( GPT41OptimizedBot, ) +from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import ( + FallResearchOnlyBot2025, +) from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot, ) From 355ba7c289ff5808fdf0daa4abf746a85e3f49dd Mon Sep 17 00:00:00 2001 From: Ben Wilson Date: Wed, 17 Dec 2025 06:23:42 +0000 Subject: [PATCH 2/7] Updates to spring template bot --- forecasting_tools/__init__.py | 3 + forecasting_tools/forecast_bots/bot_lists.py | 4 + .../official_bots/template_bot_2026_spring.py | 309 ++++++++---------- .../forecast_bots/template_bot.py | 6 +- forecasting_tools/helpers/metaculus_client.py | 4 +- pyproject.toml | 2 +- 6 files changed, 145 insertions(+), 183 deletions(-) diff --git a/forecasting_tools/__init__.py b/forecasting_tools/__init__.py index c92d905d..23c64448 100644 --- a/forecasting_tools/__init__.py +++ b/forecasting_tools/__init__.py @@ -155,6 +155,9 @@ from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( FallTemplateBot2025 as FallTemplateBot2025, ) +from 
forecasting_tools.forecast_bots.official_bots.template_bot_2026_spring import ( + SpringTemplateBot2026 as SpringTemplateBot2026, +) from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot as UniformProbabilityBot, ) diff --git a/forecasting_tools/forecast_bots/bot_lists.py b/forecasting_tools/forecast_bots/bot_lists.py index 813ad9f5..c1e73f65 100644 --- a/forecasting_tools/forecast_bots/bot_lists.py +++ b/forecasting_tools/forecast_bots/bot_lists.py @@ -29,6 +29,9 @@ from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( FallTemplateBot2025, ) +from forecasting_tools.forecast_bots.official_bots.template_bot_2026_spring import ( + SpringTemplateBot2026, +) from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot, ) @@ -51,6 +54,7 @@ def get_all_important_bot_classes() -> list[type[ForecastBot]]: Q2TemplateBotWithDecompositionV1, Q2TemplateBotWithDecompositionV2, FallResearchOnlyBot2025, + SpringTemplateBot2026, GPT41OptimizedBot, ] diff --git a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py index 97a98c85..16626004 100644 --- a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py +++ b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py @@ -30,7 +30,6 @@ from forecasting_tools.forecast_bots.forecast_bot import ForecastBot from forecasting_tools.helpers.asknews_searcher import AskNewsSearcher from forecasting_tools.helpers.metaculus_client import MetaculusClient -from forecasting_tools.helpers.prediction_extractor import PredictionExtractor from forecasting_tools.helpers.structure_output import structure_output from forecasting_tools.util.misc import clean_indents @@ -39,18 +38,18 @@ class SpringTemplateBot2026(ForecastBot): """ - This is a copy of the template bot for Spring 2026 Metaculus AI Tournament. - This bot is what is used by Metaculus in our benchmark, but is also provided as a template for new bot makers. - This template is given as-is, and though we have covered most test cases - in forecasting-tools it may be worth double checking key components locally. - - Main changes since Q2: - - An LLM now parses the final forecast output (rather than programmatic parsing) - - Added resolution criteria and fine print explicitly to the research prompt - - Previously in the prompt, nothing about upper/lower bound was shown when the bounds were open. Now a suggestion is made when this is the case. - - Support for nominal bounds was added (i.e. when there are discrete questions and normal upper/lower bounds are not as intuitive) - - The main entry point of this bot is `forecast_on_tournament` in the parent class. + This is the template bot for the Spring 2026 Metaculus AI Tournament. + This is a copy of what is used by Metaculus to run the Metac Bots in our benchmark, provided as a template for new bot makers. + This template is given as-is and should be used at your own risk. + We have covered most test cases in forecasting-tools, but it may be worth double-checking key components locally. + So far our track record has been 1 mentionable bug per season (affecting forecasts for 1-2% of total questions). + + Main changes since Fall: + - Additional prompting has been added to numeric questions to emphasize putting percentile values in the correct order.
+ - Support for conditional and date questions has been added. + - Note: Spring AIB will not use date/conditional questions, so these are only for forecasting on the main site as you wish. + + The main entry point of this bot is `bot.forecast_on_tournament(tournament_id)` in the parent class. See the script at the bottom of the file for more details on how to run the bot. Ignoring the finer details, the general flow is: - Load questions from Metaculus @@ -61,13 +60,16 @@ class SpringTemplateBot2026(ForecastBot): - Submit prediction (if publish_reports_to_metaculus is True) - Return a list of ForecastReport objects + Alternatively, you can use the MetaculusClient to make a custom filter of questions to forecast on + and forecast them with `bot.forecast_questions(questions)`. + Only the research and forecast functions need to be implemented in ForecastBot subclasses, - though you may want to override other ones. + though you may want to override other ForecastBot functions. In this example, you can change the prompts to be whatever you want, since - structure_output uses an LLMto intelligently reformat the output into the needed structure. + structure_output uses an LLM to intelligently reformat the output into the needed structure. - By default (i.e. 'tournament' mode), when you run this script, it will forecast on any open questions for the - MiniBench and Seasonal AIB tournaments. If you want to forecast on only one or the other, you can remove one + By default (i.e. 'tournament' mode), when you run this script, it will forecast on any open questions in the + primary bot tournament and MiniBench. If you want to forecast on only one or the other, you can remove one of them from the 'tournament' mode code at the bottom of the file. You can experiment with what models work best with your bot by using the `llms` parameter when initializing the bot. @@ -83,7 +85,7 @@ class SpringTemplateBot2026(ForecastBot): allowed_tries=2, ), "summarizer": "openai/gpt-4o-mini", - "researcher": "asknews/deep-research/low", + "researcher": "asknews/news-summaries", "parser": "openai/gpt-4o-mini", }, ) @@ -92,10 +94,10 @@ class SpringTemplateBot2026(ForecastBot): Then you can access the model in custom functions like this: ```python research_strategy = self.get_llm("researcher", "model_name") - if research_strategy == "asknews/deep-research/low": + if research_strategy == "asknews/news-summaries": ...
# OR - summarizer = await self.get_llm("summarizer", "model_name").invoke(prompt) + summarizer = await self.get_llm("summarizer", "llm").invoke(prompt) # OR reasoning = await self.get_llm("default", "llm").invoke(prompt) ``` @@ -118,6 +120,8 @@ class SpringTemplateBot2026(ForecastBot): _concurrency_limiter = asyncio.Semaphore(_max_concurrent_questions) _structure_output_validation_samples = 2 + ##################################### RESEARCH ##################################### + async def run_research(self, question: MetaculusQuestion) -> str: async with self._concurrency_limiter: research = "" @@ -144,6 +148,7 @@ async def run_research(self, question: MetaculusQuestion) -> str: research = await researcher.invoke(prompt) elif ( researcher == "asknews/news-summaries" + or researcher == "asknews/deep-research/low-depth" or researcher == "asknews/deep-research/medium-depth" or researcher == "asknews/deep-research/high-depth" ): @@ -167,96 +172,7 @@ async def run_research(self, question: MetaculusQuestion) -> str: logger.info(f"Found Research for URL {question.page_url}:\n{research}") return research - def _add_reasoning_to_research( - self, - research: str, - reasoning: ReasonedPrediction[PredictionTypes], - question_type: str, - ) -> str: - from forecasting_tools.data_models.data_organizer import DataOrganizer - - question_type = question_type.title() - return clean_indents( - f""" - {research} - --- - ## {question_type} Question Information - You have previously forecasted the {question_type} Question to the value: {DataOrganizer.get_readable_prediction(reasoning.prediction_value)} - This is relevant information for your current forecast, but it is NOT your current forecast, but previous forecasting information that is relevant to your current forecast. - The reasoning for the {question_type} Question was as such: - ``` - {reasoning.reasoning} - ``` - This is absolutely essential: do NOT use this reasoning to re-forecast the {question_type} question. 
- """ - ) - - async def _get_question_prediction_info( - self, question: MetaculusQuestion, research: str, question_type: str - ) -> tuple[ReasonedPrediction[PredictionTypes], str]: - from forecasting_tools.data_models.data_organizer import DataOrganizer - - previous_forecasts = question.previous_forecasts - if ( - question_type in ["parent", "child"] - and previous_forecasts - and question_type not in self.force_reforecast_in_conditional - ): - # TODO: add option to not affirm current parent/child forecasts, create new forecast - previous_forecast = previous_forecasts[-1] - current_utc_time = datetime.now(timezone.utc) - if ( - previous_forecast.timestamp_end is None - or previous_forecast.timestamp_end > current_utc_time - ): - pretty_value = DataOrganizer.get_readable_prediction(previous_forecast) - return ( - ReasonedPrediction( - prediction_value=PredictionAffirmed(), - reasoning=f"Already existing forecast reaffirmed at {pretty_value}.", - ), - research, - ) - info = await self._make_prediction(question, research) - full_research = self._add_reasoning_to_research(research, info, question_type) - return info, full_research - - async def _run_forecast_on_conditional( - self, question: ConditionalQuestion, research: str - ) -> ReasonedPrediction[ConditionalPrediction]: - parent_info, full_research = await self._get_question_prediction_info( - question.parent, research, "parent" - ) - child_info, full_research = await self._get_question_prediction_info( - question.child, research, "child" - ) - yes_info, full_research = await self._get_question_prediction_info( - question.question_yes, full_research, "yes" - ) - no_info, full_research = await self._get_question_prediction_info( - question.question_no, full_research, "no" - ) - full_reasoning = clean_indents( - f""" - ## Parent Question Reasoning - {parent_info.reasoning} - ## Child Question Reasoning - {child_info.reasoning} - ## Yes Question Reasoning - {yes_info.reasoning} - ## No Question Reasoning - {no_info.reasoning} - """ - ) - full_prediction = ConditionalPrediction( - parent=parent_info.prediction_value, - child=child_info.prediction_value, - prediction_yes=yes_info.prediction_value, - prediction_no=no_info.prediction_value, - ) - return ReasonedPrediction( - reasoning=full_reasoning, prediction_value=full_prediction - ) + ##################################### BINARY QUESTIONS ##################################### async def _run_forecast_on_binary( self, question: BinaryQuestion, research: str @@ -291,6 +207,7 @@ async def _run_forecast_on_binary( You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time. 
{self._get_conditional_disclaimer_if_necessary(question)} + The last thing you write is your final answer as: "Probability: ZZ%", 0-100 """ ) @@ -301,7 +218,6 @@ async def _binary_prompt_to_forecast( self, question: BinaryQuestion, prompt: str, - double_check_extraction: bool = False, ) -> ReasonedPrediction[float]: reasoning = await self.get_llm("default", "llm").invoke(prompt) logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") @@ -311,14 +227,6 @@ async def _binary_prompt_to_forecast( model=self.get_llm("parser", "llm"), num_validation_samples=self._structure_output_validation_samples, ) - if double_check_extraction: - redundant_extraction = PredictionExtractor.extract_last_percentage_value( - reasoning - ) - assert ( - abs(redundant_extraction - binary_prediction.prediction_in_decimal) - < 0.001 - ), f"Redundant extraction {redundant_extraction} does not match binary prediction {binary_prediction.prediction_in_decimal}" decimal_pred = max(0.01, min(0.99, binary_prediction.prediction_in_decimal)) logger.info( @@ -326,6 +234,8 @@ async def _binary_prompt_to_forecast( ) return ReasonedPrediction(prediction_value=decimal_pred, reasoning=reasoning) + ##################################### MULTIPLE CHOICE QUESTIONS ##################################### + async def _run_forecast_on_multiple_choice( self, question: MultipleChoiceQuestion, research: str ) -> ReasonedPrediction[PredictedOptionList]: @@ -373,7 +283,6 @@ async def _multiple_choice_prompt_to_forecast( self, question: MultipleChoiceQuestion, prompt: str, - double_check_extraction: bool = False, ) -> ReasonedPrediction[PredictedOptionList]: parsing_instructions = clean_indents( f""" @@ -393,31 +302,6 @@ async def _multiple_choice_prompt_to_forecast( num_validation_samples=self._structure_output_validation_samples, additional_instructions=parsing_instructions, ) - if double_check_extraction: - redundant_extraction = ( - PredictionExtractor.extract_option_list_with_percentage_afterwards( - reasoning, question.options - ) - ) - for redundant_prediction in redundant_extraction.predicted_options: - matching_original_option = next( - ( - option - for option in predicted_option_list.predicted_options - if option.option_name == redundant_prediction.option_name - ), - None, - ) - assert ( - matching_original_option is not None - ), f"Matching original option not found for {redundant_prediction.option_name}" - assert ( - abs( - redundant_prediction.probability - - matching_original_option.probability - ) - < 0.001 - ), f"Redundant extraction {redundant_prediction.probability} does not match original option {matching_original_option.probability} for option {redundant_prediction.option_name}" logger.info( f"Forecasted URL {question.page_url} with prediction: {predicted_option_list}." @@ -426,6 +310,8 @@ async def _multiple_choice_prompt_to_forecast( prediction_value=predicted_option_list, reasoning=reasoning ) + ##################################### NUMERIC QUESTIONS ##################################### + async def _run_forecast_on_numeric( self, question: NumericQuestion, research: str ) -> ReasonedPrediction[NumericDistribution]: @@ -459,7 +345,7 @@ async def _run_forecast_on_numeric( Formatting Instructions: - Please notice the units requested and give your answer in these units (e.g. whether you represent a number as 1,000,000 or 1 million). - Never use scientific notation. 
- - Always start with a smaller number (more negative if negative) and then increase from there + - Always start with a smaller number (more negative if negative) and then increase from there. The value for percentile 10 should always be less than the value for percentile 20, and so on. Before answering you write: (a) The time left until the outcome to the question is known. @@ -474,12 +360,12 @@ async def _run_forecast_on_numeric( The last thing you write is your final answer as: " - Percentile 10: XX + Percentile 10: XX (lowest number value) Percentile 20: XX Percentile 40: XX Percentile 60: XX Percentile 80: XX - Percentile 90: XX + Percentile 90: XX (highest number value) " """ ) @@ -489,7 +375,6 @@ async def _numeric_prompt_to_forecast( self, question: NumericQuestion, prompt: str, - double_check_extraction: bool = False, ) -> ReasonedPrediction[NumericDistribution]: reasoning = await self.get_llm("default", "llm").invoke(prompt) logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") @@ -513,34 +398,14 @@ async def _numeric_prompt_to_forecast( additional_instructions=parsing_instructions, num_validation_samples=self._structure_output_validation_samples, ) - - if double_check_extraction: - redundant_extraction = PredictionExtractor.extract_numeric_distribution_from_list_of_percentile_number_and_probability( - reasoning, question - ) - for redundant_percentile in redundant_extraction.declared_percentiles: - matching_original_percentile = next( - ( - percentile - for percentile in percentile_list - if abs(percentile.percentile - redundant_percentile.percentile) - < 0.001 - ), - None, - ) - assert ( - matching_original_percentile is not None - ), f"Matching original percentile not found for {redundant_percentile.percentile}" - assert ( - abs(redundant_percentile.value - matching_original_percentile.value) - < 0.001 - ), f"Redundant extraction {redundant_percentile.value} does not match original percentile {matching_original_percentile.value} for percentile {redundant_percentile.percentile}" prediction = NumericDistribution.from_question(percentile_list, question) logger.info( f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." ) return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) + ##################################### DATE QUESTIONS ##################################### + async def _run_forecast_on_date( self, question: DateQuestion, research: str ) -> ReasonedPrediction[NumericDistribution]: @@ -604,7 +469,6 @@ async def _date_prompt_to_forecast( self, question: DateQuestion, prompt: str, - double_check_extraction: bool = False, ) -> ReasonedPrediction[NumericDistribution]: reasoning = await self.get_llm("default", "llm").invoke(prompt) logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") @@ -632,10 +496,6 @@ async def _date_prompt_to_forecast( ) for percentile in date_percentile_list ] - - if double_check_extraction: - raise ValueError("Double check extraction not supported for date questions") - prediction = NumericDistribution.from_question(percentile_list, question) logger.info( f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." @@ -673,6 +533,98 @@ def _create_upper_and_lower_bound_messages( lower_bound_message = f"The outcome can not be lower than {lower_bound_number} {unit_of_measure}." 
return upper_bound_message, lower_bound_message + ##################################### CONDITIONAL QUESTIONS ##################################### + + async def _run_forecast_on_conditional( + self, question: ConditionalQuestion, research: str + ) -> ReasonedPrediction[ConditionalPrediction]: + parent_info, full_research = await self._get_question_prediction_info( + question.parent, research, "parent" + ) + child_info, full_research = await self._get_question_prediction_info( + question.child, research, "child" + ) + yes_info, full_research = await self._get_question_prediction_info( + question.question_yes, full_research, "yes" + ) + no_info, full_research = await self._get_question_prediction_info( + question.question_no, full_research, "no" + ) + full_reasoning = clean_indents( + f""" + ## Parent Question Reasoning + {parent_info.reasoning} + ## Child Question Reasoning + {child_info.reasoning} + ## Yes Question Reasoning + {yes_info.reasoning} + ## No Question Reasoning + {no_info.reasoning} + """ + ) + full_prediction = ConditionalPrediction( + parent=parent_info.prediction_value, # type: ignore + child=child_info.prediction_value, # type: ignore + prediction_yes=yes_info.prediction_value, # type: ignore + prediction_no=no_info.prediction_value, # type: ignore + ) + return ReasonedPrediction( + reasoning=full_reasoning, prediction_value=full_prediction + ) + + async def _get_question_prediction_info( + self, question: MetaculusQuestion, research: str, question_type: str + ) -> tuple[ReasonedPrediction[PredictionTypes | PredictionAffirmed], str]: + from forecasting_tools.data_models.data_organizer import DataOrganizer + + previous_forecasts = question.previous_forecasts + if ( + question_type in ["parent", "child"] + and previous_forecasts + and question_type not in self.force_reforecast_in_conditional + ): + # TODO: add option to not affirm current parent/child forecasts, create new forecast + previous_forecast = previous_forecasts[-1] + current_utc_time = datetime.now(timezone.utc) + if ( + previous_forecast.timestamp_end is None + or previous_forecast.timestamp_end > current_utc_time + ): + assert isinstance(previous_forecast, PredictionTypes) + pretty_value = DataOrganizer.get_readable_prediction(previous_forecast) + prediction = ReasonedPrediction( + prediction_value=PredictionAffirmed(), + reasoning=f"Already existing forecast reaffirmed at {pretty_value}.", + ) + return (prediction, research) # type: ignore + info = await self._make_prediction(question, research) + full_research = self._add_reasoning_to_research(research, info, question_type) + return info, full_research # type: ignore + + def _add_reasoning_to_research( + self, + research: str, + reasoning: ReasonedPrediction[PredictionTypes], + question_type: str, + ) -> str: + from forecasting_tools.data_models.data_organizer import DataOrganizer + + question_type = question_type.title() + return clean_indents( + f""" + {research} + --- + ## {question_type} Question Information + You have previously forecasted the {question_type} Question to the value: {DataOrganizer.get_readable_prediction(reasoning.prediction_value)} + This is relevant information for your current forecast, but it is NOT your current forecast; treat it as previous forecasting information that can inform your current forecast. + The reasoning for the {question_type} Question was as follows: + ``` + {reasoning.reasoning} + ``` + This is absolutely essential: do NOT use this reasoning to re-forecast the {question_type} question.
     def _get_conditional_disclaimer_if_necessary(
         self, question: MetaculusQuestion
     ) -> str:
@@ -698,7 +650,7 @@ def _get_conditional_disclaimer_if_necessary(
     litellm_logger.propagate = False
 
     parser = argparse.ArgumentParser(
-        description="Run the Q1TemplateBot forecasting system"
+        description="Run the TemplateBot forecasting system"
     )
     parser.add_argument(
         "--mode",
@@ -730,13 +682,14 @@ def _get_conditional_disclaimer_if_necessary(
        #         allowed_tries=2,
        #     ),
        #     "summarizer": "openai/gpt-4o-mini",
-       #     "researcher": "asknews/deep-research/low",
+       #     "researcher": "asknews/news-summaries",
        #     "parser": "openai/gpt-4o-mini",
        # },
     )
 
     client = MetaculusClient()
     if run_mode == "tournament":
+        # You may want to change this to the specific tournament ID you want to forecast on
         seasonal_tournament_reports = asyncio.run(
             template_bot.forecast_on_tournament(
                 client.CURRENT_AI_COMPETITION_ID, return_exceptions=True
diff --git a/forecasting_tools/forecast_bots/template_bot.py b/forecasting_tools/forecast_bots/template_bot.py
index b5e7393b..f76a699f 100644
--- a/forecasting_tools/forecast_bots/template_bot.py
+++ b/forecasting_tools/forecast_bots/template_bot.py
@@ -1,12 +1,12 @@
 from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import (
     FallResearchOnlyBot2025,
 )
-from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import (
-    FallTemplateBot2025,
+from forecasting_tools.forecast_bots.official_bots.template_bot_2026_spring import (
+    SpringTemplateBot2026,
 )
 
 
-class TemplateBot(FallTemplateBot2025):
+class TemplateBot(SpringTemplateBot2026):
     pass
diff --git a/forecasting_tools/helpers/metaculus_client.py b/forecasting_tools/helpers/metaculus_client.py
index 669cb02b..771ee6b8 100644
--- a/forecasting_tools/helpers/metaculus_client.py
+++ b/forecasting_tools/helpers/metaculus_client.py
@@ -108,6 +108,7 @@ class MetaculusClient:
     AI_COMPETITION_ID_Q1 = 32627  # https://www.metaculus.com/tournament/aibq1/
     AI_COMPETITION_ID_Q2 = 32721  # https://www.metaculus.com/tournament/aibq2/
     AIB_FALL_2025_ID = 32813  # https://www.metaculus.com/tournament/fall-aib-2025/
+    AIB_SPRING_2026_ID = 32916
     PRO_COMPARISON_TOURNAMENT_Q1 = 32631
     PRO_COMPARISON_TOURNAMENT_Q2 = (
         32761  # https://www.metaculus.com/tournament/pro-benchmark-q22025
@@ -118,6 +119,7 @@ class MetaculusClient:
     Q1_2025_QUARTERLY_CUP = 32630
     METACULUS_CUP_2025_1_ID = 32726  # Summer cup 2025
     METACULUS_CUP_FALL_2025_ID = 32828
+    METACULUS_CUP_SPRING_2026_ID = None
     AI_2027_TOURNAMENT_ID = "ai-2027"
     MAIN_FEED = 144  # site_main
@@ -126,7 +128,7 @@ class MetaculusClient:
     CURRENT_METACULUS_CUP_ID = METACULUS_CUP_FALL_2025_ID
     CURRENT_QUARTERLY_CUP_ID = CURRENT_METACULUS_CUP_ID  # Consider this parameter deprecated since quarterly cup is no longer active
-    CURRENT_AI_COMPETITION_ID = AIB_FALL_2025_ID
+    CURRENT_AI_COMPETITION_ID = AIB_SPRING_2026_ID
     CURRENT_MINIBENCH_ID = "minibench"
     CURRENT_MARKET_PULSE_ID = Q4_2025_MARKET_PULSE_ID
diff --git a/pyproject.toml b/pyproject.toml
index 94aa43e6..c6327418 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "forecasting-tools"
-version = "0.2.79"
+version = "0.2.80"
 description = "AI forecasting and research tools to help humans reason about and forecast the future"
 authors = ["Benjamin Wilson "]
 license = "MIT"
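Because `TemplateBot` now subclasses `SpringTemplateBot2026` (per the `template_bot.py` hunk above), existing imports pick up the spring behaviour with no call-site changes. A quick sanity-check sketch, assuming only the constructor argument shown elsewhere in this series:

```python
from forecasting_tools.forecast_bots.official_bots.template_bot_2026_spring import (
    SpringTemplateBot2026,
)
from forecasting_tools.forecast_bots.template_bot import TemplateBot

# TemplateBot is a re-pointed alias, so any isinstance checks against the
# spring class should pass for code that still constructs TemplateBot.
bot = TemplateBot(publish_reports_to_metaculus=False)
assert isinstance(bot, SpringTemplateBot2026)
```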
From 5db2071f976e1ca67dfc1c2e836975987e98d74a Mon Sep 17 00:00:00 2001
From: Ben Wilson
Date: Wed, 17 Dec 2025 07:17:31 +0000
Subject: [PATCH 3/7] Updates to spring bot

---
 forecasting_tools/forecast_bots/forecast_bot.py              | 2 ++
 .../forecast_bots/official_bots/template_bot_2026_spring.py  | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/forecasting_tools/forecast_bots/forecast_bot.py b/forecasting_tools/forecast_bots/forecast_bot.py
index c0cc42f5..d2bbcea2 100644
--- a/forecasting_tools/forecast_bots/forecast_bot.py
+++ b/forecasting_tools/forecast_bots/forecast_bot.py
@@ -987,6 +987,8 @@ def _llm_config_defaults(cls) -> dict[str, str | GeneralLlm | None]:
 
         if os.getenv("ASKNEWS_CLIENT_ID") and os.getenv("ASKNEWS_SECRET"):
             researcher = "asknews/news-summaries"
+        elif os.getenv("ASKNEWS_API_KEY"):
+            researcher = "asknews/news-summaries"
         elif os.getenv("PERPLEXITY_API_KEY"):
             researcher = GeneralLlm(model="perplexity/sonar-pro", temperature=0.1)
         elif os.getenv("OPENROUTER_API_KEY"):
diff --git a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
index 16626004..9893aea5 100644
--- a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
+++ b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
@@ -79,7 +79,7 @@ class SpringTemplateBot2026(ForecastBot):
     ...
         llms={  # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you
             "default": GeneralLlm(
-                model="openrouter/openai/gpt-4o", # "anthropic/claude-3-5-sonnet-20241022", etc (see docs for litellm)
+                model="openrouter/openai/gpt-4o", # "anthropic/claude-sonnet-4-20250514", etc (see docs for litellm)
                 temperature=0.3,
                 timeout=40,
                 allowed_tries=2,
@@ -676,7 +676,7 @@ def _get_conditional_disclaimer_if_necessary(
         skip_previously_forecasted_questions=True,
         # llms={  # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you
         #     "default": GeneralLlm(
-        #         model="openrouter/openai/gpt-4o", # "anthropic/claude-3-5-sonnet-20241022", etc (see docs for litellm)
+        #         model="openrouter/openai/gpt-4o", # "anthropic/claude-sonnet-4-20250514", etc (see docs for litellm)
        #         temperature=0.3,
        #         timeout=40,
        #         allowed_tries=2,

From 3a198559171f97f82a48be932102d78127ff63f6 Mon Sep 17 00:00:00 2001
From: Ben Wilson
Date: Wed, 17 Dec 2025 07:36:27 +0000
Subject: [PATCH 4/7] Updated exports and reorganized metaculus client

---
 forecasting_tools/__init__.py                 |  13 ++
 forecasting_tools/helpers/metaculus_client.py | 214 +++++++++---------
 2 files changed, 120 insertions(+), 107 deletions(-)

diff --git a/forecasting_tools/__init__.py b/forecasting_tools/__init__.py
index 23c64448..f325a0c6 100644
--- a/forecasting_tools/__init__.py
+++ b/forecasting_tools/__init__.py
@@ -130,6 +130,19 @@
 NumericReport.model_rebuild()
 DiscreteReport.model_rebuild()
 DateReport.model_rebuild()
+from forecasting_tools.data_models.conditional_models import (
+    ConditionalPrediction as ConditionalPrediction,
+)
+from forecasting_tools.data_models.conditional_models import (
+    PredictionAffirmed as PredictionAffirmed,
+)
+from forecasting_tools.data_models.data_organizer import (
+    PredictionTypes as PredictionTypes,
+)
+from forecasting_tools.data_models.numeric_report import Percentile as Percentile
+from forecasting_tools.data_models.questions import (
+    ConditionalQuestion as ConditionalQuestion,
+)
 from forecasting_tools.data_models.questions import QuestionState as QuestionState
 from forecasting_tools.forecast_bots.forecast_bot import ForecastBot as ForecastBot
 from forecasting_tools.forecast_bots.forecast_bot import Notepad as Notepad
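With the re-exports added to `__init__.py` above, the conditional-forecasting types should be importable from the package root. A minimal sketch using only the names in that hunk:

```python
# These imports rely on nothing beyond the re-exports shown above.
from forecasting_tools import (
    ConditionalPrediction,
    ConditionalQuestion,
    Percentile,
    PredictionAffirmed,
    PredictionTypes,
)
```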
diff --git a/forecasting_tools/helpers/metaculus_client.py b/forecasting_tools/helpers/metaculus_client.py
index 771ee6b8..7797b383 100644
--- a/forecasting_tools/helpers/metaculus_client.py
+++ b/forecasting_tools/helpers/metaculus_client.py
@@ -152,17 +152,6 @@ def __init__(
         self.sleep_time_between_requests_min = sleep_time_between_requests_min
         self.sleep_jitter_seconds = sleep_jitter_seconds
 
-    @retry_with_exponential_backoff()
-    def get_current_user_id(self):
-        self._sleep_between_requests()
-        response = requests.get(
-            f"{self.base_url}/users/me",
-            **self._get_auth_headers(),  # type: ignore
-        )
-        raise_for_status_with_additional_info(response)
-        content = json.loads(response.content)
-        return content["id"]
-
     @retry_with_exponential_backoff()
     def post_question_comment(
         self,
@@ -186,102 +175,6 @@ def post_question_comment(
         logger.info(f"Posted comment on post {post_id}")
         raise_for_status_with_additional_info(response)
 
-    @retry_with_exponential_backoff()
-    def post_question_link(
-        self,
-        question1_id: int,
-        question2_id: int,
-        direction: int,
-        strength: int,
-        link_type: str,
-    ) -> int:
-        """
-        Posts a link between questions
-        :param question1_id
-        :param question2_id
-        :param direction: +1 for positive, -1 for negative
-        :param strength: 1 for low, 2 for medium, 5 for high
-        :param link_type: only supports "causal" for now
-        :return: id of the created link
-        """
-        self._sleep_between_requests()
-        response = requests.post(
-            f"{self.base_url}/coherence/links/create/",
-            json={
-                "question1_id": question1_id,
-                "question2_id": question2_id,
-                "direction": direction,
-                "strength": strength,
-                "type": link_type,
-            },
-            **self._get_auth_headers(),  # type: ignore
-            timeout=self.timeout,
-        )
-        logger.info(f"Posted question link between {question1_id} and {question2_id}")
-        raise_for_status_with_additional_info(response)
-        content = json.loads(response.content)
-        return content["id"]
-
-    @retry_with_exponential_backoff()
-    def get_links_for_question(self, question_id: int) -> List[DetailedCoherenceLink]:
-        """
-        Returns all links associated with a specific question
-        direction is +1 for positive and -1 for negative
-        strength is 1 for low, 2 for medium and 5 for high
-        """
-        self._sleep_between_requests()
-        response = requests.get(
-            f"{self.base_url}/coherence/question/{question_id}/links/",
-            **self._get_auth_headers(),  # type: ignore
-            timeout=self.timeout,
-        )
-        raise_for_status_with_additional_info(response)
-        content = json.loads(response.content)["data"]
-        links = [
-            DetailedCoherenceLink.from_metaculus_api_json(link) for link in content
-        ]
-        return links
-
-    @retry_with_exponential_backoff()
-    def delete_question_link(self, link_id: int):
-        self._sleep_between_requests()
-        response = requests.delete(
-            f"{self.base_url}/coherence/links/{link_id}/delete/",
-            **self._get_auth_headers(),  # type: ignore
-            timeout=self.timeout,
-        )
-        logger.info(f"Deleted question link with id {link_id}")
-        raise_for_status_with_additional_info(response)
-
-    @retry_with_exponential_backoff()
-    def get_needs_update_questions(
-        self,
-        question_id: int,
-        last_datetime: datetime,
-        user_id_for_links: int | None = None,
-    ) -> NeedsUpdateResponse:
-        self._sleep_between_requests()
-        json_data: dict[str, Any] = {"datetime": last_datetime.isoformat()}
-        if user_id_for_links:
-            json_data["user_id_for_links"] = user_id_for_links
-        response = requests.get(
-            f"{self.base_url}/coherence/question/{question_id}/links/needs-update/",
-            **self._get_auth_headers(),  # type: ignore
-            timeout=self.timeout,
-            json=json_data,
-        )
-        raise_for_status_with_additional_info(response)
-        content = json.loads(response.content)
-        questions = [
-            DataOrganizer.get_question_from_question_json(json_q)
-            for json_q in content["questions"]
-        ]
-        links = [
-            CoherenceLink.model_validate(json_link) for json_link in content["links"]
-        ]
-        result = NeedsUpdateResponse(questions=questions, links=links)
-        return result
-
     def post_binary_question_prediction(
         self, question_id: int, prediction_in_decimal: float
     ) -> None:
@@ -506,6 +399,113 @@ def get_benchmark_questions(
         questions = typeguard.check_type(questions, list[BinaryQuestion])
         return questions
 
+    @retry_with_exponential_backoff()
+    def get_current_user_id(self):
+        self._sleep_between_requests()
+        response = requests.get(
+            f"{self.base_url}/users/me",
+            **self._get_auth_headers(),  # type: ignore
+        )
+        raise_for_status_with_additional_info(response)
+        content = json.loads(response.content)
+        return content["id"]
+
+    @retry_with_exponential_backoff()
+    def post_question_link(
+        self,
+        question1_id: int,
+        question2_id: int,
+        direction: int,
+        strength: int,
+        link_type: str,
+    ) -> int:
+        """
+        Posts a link between questions
+        :param question1_id
+        :param question2_id
+        :param direction: +1 for positive, -1 for negative
+        :param strength: 1 for low, 2 for medium, 5 for high
+        :param link_type: only supports "causal" for now
+        :return: id of the created link
+        """
+        self._sleep_between_requests()
+        response = requests.post(
+            f"{self.base_url}/coherence/links/create/",
+            json={
+                "question1_id": question1_id,
+                "question2_id": question2_id,
+                "direction": direction,
+                "strength": strength,
+                "type": link_type,
+            },
+            **self._get_auth_headers(),  # type: ignore
+            timeout=self.timeout,
+        )
+        logger.info(f"Posted question link between {question1_id} and {question2_id}")
+        raise_for_status_with_additional_info(response)
+        content = json.loads(response.content)
+        return content["id"]
+
+    @retry_with_exponential_backoff()
+    def get_links_for_question(self, question_id: int) -> List[DetailedCoherenceLink]:
+        """
+        Returns all links associated with a specific question
+        direction is +1 for positive and -1 for negative
+        strength is 1 for low, 2 for medium and 5 for high
+        """
+        self._sleep_between_requests()
+        response = requests.get(
+            f"{self.base_url}/coherence/question/{question_id}/links/",
+            **self._get_auth_headers(),  # type: ignore
+            timeout=self.timeout,
+        )
+        raise_for_status_with_additional_info(response)
+        content = json.loads(response.content)["data"]
+        links = [
+            DetailedCoherenceLink.from_metaculus_api_json(link) for link in content
+        ]
+        return links
+
+    @retry_with_exponential_backoff()
+    def delete_question_link(self, link_id: int):
+        self._sleep_between_requests()
+        response = requests.delete(
+            f"{self.base_url}/coherence/links/{link_id}/delete/",
+            **self._get_auth_headers(),  # type: ignore
+            timeout=self.timeout,
+        )
+        logger.info(f"Deleted question link with id {link_id}")
+        raise_for_status_with_additional_info(response)
+
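The relocated coherence-link helpers keep their signatures, so a round-trip usage sketch based purely on the docstrings above still applies (the question ids are hypothetical placeholders, and the calls hit the live API, so this needs a valid METACULUS_TOKEN):

```python
from forecasting_tools.helpers.metaculus_client import MetaculusClient

client = MetaculusClient()
link_id = client.post_question_link(
    question1_id=11111,  # hypothetical question id
    question2_id=22222,  # hypothetical question id
    direction=1,         # +1 positive, -1 negative
    strength=2,          # 1 low, 2 medium, 5 high
    link_type="causal",  # only supported type per the docstring
)
links = client.get_links_for_question(11111)  # includes the link just made
client.delete_question_link(link_id)
```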
+    @retry_with_exponential_backoff()
+    def get_needs_update_questions(
+        self,
+        question_id: int,
+        last_datetime: datetime,
+        user_id_for_links: int | None = None,
+    ) -> NeedsUpdateResponse:
+        self._sleep_between_requests()
+        json_data: dict[str, Any] = {"datetime": last_datetime.isoformat()}
+        if user_id_for_links:
+            json_data["user_id_for_links"] = user_id_for_links
+        response = requests.get(
+            f"{self.base_url}/coherence/question/{question_id}/links/needs-update/",
+            **self._get_auth_headers(),  # type: ignore
+            timeout=self.timeout,
+            json=json_data,
+        )
+        raise_for_status_with_additional_info(response)
+        content = json.loads(response.content)
+        questions = [
+            DataOrganizer.get_question_from_question_json(json_q)
+            for json_q in content["questions"]
+        ]
+        links = [
+            CoherenceLink.model_validate(json_link) for json_link in content["links"]
+        ]
+        result = NeedsUpdateResponse(questions=questions, links=links)
+        return result
+
     def _get_auth_headers(self) -> dict[str, dict[str, str]]:
         METACULUS_TOKEN = os.getenv("METACULUS_TOKEN")
         if METACULUS_TOKEN is None:

From ccce4855b86c05ebc1c2e0e61285175142e77279 Mon Sep 17 00:00:00 2001
From: Ben Wilson
Date: Wed, 17 Dec 2025 15:32:56 +0000
Subject: [PATCH 5/7] Added metadata line

---
 .../forecast_bots/official_bots/template_bot_2026_spring.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
index 9893aea5..12b9ff0b 100644
--- a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
+++ b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
@@ -590,8 +590,7 @@ async def _get_question_prediction_info(
                 previous_forecast.timestamp_end is None
                 or previous_forecast.timestamp_end > current_utc_time
             ):
-                assert isinstance(previous_forecast, PredictionTypes)
-                pretty_value = DataOrganizer.get_readable_prediction(previous_forecast)
+                pretty_value = DataOrganizer.get_readable_prediction(previous_forecast)  # type: ignore
                 prediction = ReasonedPrediction(
                     prediction_value=PredictionAffirmed(),
                     reasoning=f"Already existing forecast reaffirmed at {pretty_value}.",
@@ -674,6 +673,7 @@ def _get_conditional_disclaimer_if_necessary(
         publish_reports_to_metaculus=True,
         folder_to_save_reports_to=None,
         skip_previously_forecasted_questions=True,
+        extra_metadata_in_explanation=True,
         # llms={  # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you
         #     "default": GeneralLlm(
         #         model="openrouter/openai/gpt-4o", # "anthropic/claude-sonnet-4-20250514", etc (see docs for litellm)

From d978e631559c36a30cf5dd059238a6b74bd58114 Mon Sep 17 00:00:00 2001
From: Ben Wilson
Date: Fri, 19 Dec 2025 03:36:36 +0000
Subject: [PATCH 6/7] Changed default inbound outcome count

---
 forecasting_tools/data_models/numeric_report.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py
index 1ef7d876..9e18aca6 100644
--- a/forecasting_tools/data_models/numeric_report.py
+++ b/forecasting_tools/data_models/numeric_report.py
@@ -22,10 +22,10 @@
 
 
 class NumericDefaults:
-    DEFAULT_INBOUND_OUTCOME_COUNT = 200
     DEFAULT_CDF_SIZE = (
         201  # Discrete questions have fewer points, Numeric will have 201 points
     )
+    DEFAULT_INBOUND_OUTCOME_COUNT = DEFAULT_CDF_SIZE - 1
     MAX_NUMERIC_PMF_VALUE = 0.2
 
     @classmethod
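The `numeric_report.py` hunk above replaces a bare literal with a derived constant: a CDF sampled at N points spans N - 1 in-bound outcome bins, so deriving the count from the CDF size keeps the two values from drifting apart. The arithmetic, spelled out as a sketch:

```python
# With DEFAULT_CDF_SIZE = 201 (numeric questions use 201 CDF points),
# the derived in-bound outcome count equals the old hard-coded 200.
DEFAULT_CDF_SIZE = 201
DEFAULT_INBOUND_OUTCOME_COUNT = DEFAULT_CDF_SIZE - 1
assert DEFAULT_INBOUND_OUTCOME_COUNT == 200
```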
From f7620650668fd40a70127664baf4ade45c23ad3c Mon Sep 17 00:00:00 2001
From: Ben Wilson
Date: Fri, 19 Dec 2025 03:48:19 +0000
Subject: [PATCH 7/7] Incorporated AI code review

---
 .../official_bots/template_bot_2026_spring.py | 8 ++++----
 forecasting_tools/helpers/metaculus_client.py | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
index 12b9ff0b..5f5f02c1 100644
--- a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
+++ b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py
@@ -45,7 +45,7 @@ class SpringTemplateBot2026(ForecastBot):
     So far our track record has been 1 mentionable bug per season (affecting forecasts for 1-2% of total questions)
 
     Main changes since Fall:
-    - Additional prompting has been added to numeric questions to emphasize putting pecentile values in the correct order.
+    - Additional prompting has been added to numeric questions to emphasize putting percentile values in the correct order.
     - Support for conditional and date questions has been added
         - Note: Spring AIB will not use date/conditional questions, so these are only for forecasting on the main site as you wish.
@@ -93,7 +93,7 @@ class SpringTemplateBot2026(ForecastBot):
     Then you can access the model in custom functions like this:
     ```python
-    research_strategy = self.get_llm("researcher", "model_name"
+    research_strategy = self.get_llm("researcher", "model_name")
     if research_strategy == "asknews/news-summaries":
         ...
     # OR
@@ -265,7 +265,7 @@ async def _run_forecast_on_multiple_choice(
             Before answering you write:
             (a) The time left until the outcome to the question is known.
             (b) The status quo outcome if nothing changed.
-            (c) A description of an scenario that results in an unexpected outcome.
+            (c) A description of a scenario that results in an unexpected outcome.
             {self._get_conditional_disclaimer_if_necessary(question)}
 
             You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes.
@@ -436,7 +436,7 @@ async def _run_forecast_on_date(
 
             Formatting Instructions:
             - This is a date question, and as such, the answer must be expressed in terms of dates.
-            - The dates must be written in the format of YYYY-MM-DD. If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ.No other formatting is allowed.
+            - The dates must be written in the format of YYYY-MM-DD. If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed.
             - Always start with a lower date chronologically and then increase from there.
                 - Do NOT forget this. The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there.
diff --git a/forecasting_tools/helpers/metaculus_client.py b/forecasting_tools/helpers/metaculus_client.py
index 7797b383..96c488bc 100644
--- a/forecasting_tools/helpers/metaculus_client.py
+++ b/forecasting_tools/helpers/metaculus_client.py
@@ -400,7 +400,7 @@ def get_benchmark_questions(
         return questions
 
     @retry_with_exponential_backoff()
-    def get_current_user_id(self):
+    def get_current_user_id(self) -> int:
         self._sleep_between_requests()
         response = requests.get(
             f"{self.base_url}/users/me",
@@ -408,7 +408,7 @@ def get_current_user_id(self):
         )
         raise_for_status_with_additional_info(response)
         content = json.loads(response.content)
-        return content["id"]
+        return int(content["id"])
 
     @retry_with_exponential_backoff()
     def post_question_link(