diff --git a/forecasting_tools/__init__.py b/forecasting_tools/__init__.py index 92b7393e..f325a0c6 100644 --- a/forecasting_tools/__init__.py +++ b/forecasting_tools/__init__.py @@ -130,16 +130,23 @@ NumericReport.model_rebuild() DiscreteReport.model_rebuild() DateReport.model_rebuild() +from forecasting_tools.data_models.conditional_models import ( + ConditionalPrediction as ConditionalPrediction, +) +from forecasting_tools.data_models.conditional_models import ( + PredictionAffirmed as PredictionAffirmed, +) +from forecasting_tools.data_models.data_organizer import ( + PredictionTypes as PredictionTypes, +) +from forecasting_tools.data_models.numeric_report import Percentile as Percentile +from forecasting_tools.data_models.questions import ( + ConditionalQuestion as ConditionalQuestion, +) from forecasting_tools.data_models.questions import QuestionState as QuestionState from forecasting_tools.forecast_bots.forecast_bot import ForecastBot as ForecastBot from forecasting_tools.forecast_bots.forecast_bot import Notepad as Notepad from forecasting_tools.forecast_bots.main_bot import MainBot as MainBot -from forecasting_tools.forecast_bots.official_bots.fall_research_only_bot import ( - FallResearchOnlyBot2025 as FallResearchOnlyBot2025, -) -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( - FallTemplateBot2025 as FallTemplateBot2025, -) from forecasting_tools.forecast_bots.official_bots.gpt_4_1_optimized_bot import ( GPT41OptimizedBot as GPT41OptimizedBot, ) @@ -155,6 +162,15 @@ from forecasting_tools.forecast_bots.official_bots.q4_template_bot import ( Q4TemplateBot2024 as Q4TemplateBot2024, ) +from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import ( + FallResearchOnlyBot2025 as FallResearchOnlyBot2025, +) +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( + FallTemplateBot2025 as FallTemplateBot2025, +) +from forecasting_tools.forecast_bots.official_bots.template_bot_2026_spring import ( + SpringTemplateBot2026 as SpringTemplateBot2026, +) from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot as UniformProbabilityBot, ) diff --git a/forecasting_tools/data_models/numeric_report.py b/forecasting_tools/data_models/numeric_report.py index 1ef7d876..9e18aca6 100644 --- a/forecasting_tools/data_models/numeric_report.py +++ b/forecasting_tools/data_models/numeric_report.py @@ -22,10 +22,10 @@ class NumericDefaults: - DEFAULT_INBOUND_OUTCOME_COUNT = 200 DEFAULT_CDF_SIZE = ( 201 # Discrete questions have fewer points, Numeric will have 201 points ) + DEFAULT_INBOUND_OUTCOME_COUNT = DEFAULT_CDF_SIZE - 1 MAX_NUMERIC_PMF_VALUE = 0.2 @classmethod diff --git a/forecasting_tools/forecast_bots/bot_lists.py b/forecasting_tools/forecast_bots/bot_lists.py index 87141c0e..c1e73f65 100644 --- a/forecasting_tools/forecast_bots/bot_lists.py +++ b/forecasting_tools/forecast_bots/bot_lists.py @@ -8,12 +8,6 @@ from forecasting_tools.forecast_bots.experiments.q4_veritas_bot import Q4VeritasBot from forecasting_tools.forecast_bots.forecast_bot import ForecastBot from forecasting_tools.forecast_bots.main_bot import MainBot -from forecasting_tools.forecast_bots.official_bots.fall_research_only_bot import ( - FallResearchOnlyBot2025, -) -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( - FallTemplateBot2025, -) from forecasting_tools.forecast_bots.official_bots.gpt_4_1_optimized_bot import ( GPT41OptimizedBot, ) @@ -29,6 +23,15 @@ from forecasting_tools.forecast_bots.official_bots.q4_template_bot import ( Q4TemplateBot2024, ) +from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import ( + FallResearchOnlyBot2025, +) +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( + FallTemplateBot2025, +) +from forecasting_tools.forecast_bots.official_bots.template_bot_2026_spring import ( + SpringTemplateBot2026, +) from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot, ) @@ -51,6 +54,7 @@ def get_all_important_bot_classes() -> list[type[ForecastBot]]: Q2TemplateBotWithDecompositionV1, Q2TemplateBotWithDecompositionV2, FallResearchOnlyBot2025, + SpringTemplateBot2026, GPT41OptimizedBot, ] diff --git a/forecasting_tools/forecast_bots/forecast_bot.py b/forecasting_tools/forecast_bots/forecast_bot.py index c0cc42f5..d2bbcea2 100644 --- a/forecasting_tools/forecast_bots/forecast_bot.py +++ b/forecasting_tools/forecast_bots/forecast_bot.py @@ -987,6 +987,8 @@ def _llm_config_defaults(cls) -> dict[str, str | GeneralLlm | None]: if os.getenv("ASKNEWS_CLIENT_ID") and os.getenv("ASKNEWS_SECRET"): researcher = "asknews/news-summaries" + elif os.getenv("ASKNEWS_API_KEY"): + researcher = "asknews/news-summaries" elif os.getenv("PERPLEXITY_API_KEY"): researcher = GeneralLlm(model="perplexity/sonar-pro", temperature=0.1) elif os.getenv("OPENROUTER_API_KEY"): diff --git a/forecasting_tools/forecast_bots/official_bots/gpt_4_1_optimized_bot.py b/forecasting_tools/forecast_bots/official_bots/gpt_4_1_optimized_bot.py index 27ad3184..4ef4b31c 100644 --- a/forecasting_tools/forecast_bots/official_bots/gpt_4_1_optimized_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/gpt_4_1_optimized_bot.py @@ -4,7 +4,7 @@ from forecasting_tools.data_models.forecast_report import ReasonedPrediction from forecasting_tools.data_models.questions import BinaryQuestion, MetaculusQuestion -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( FallTemplateBot2025, ) from forecasting_tools.helpers.asknews_searcher import AskNewsSearcher diff --git a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py b/forecasting_tools/forecast_bots/official_bots/research_only_bot_2025_fall.py similarity index 99% rename from forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py rename to forecasting_tools/forecast_bots/official_bots/research_only_bot_2025_fall.py index 0ce2dadd..f3893a28 100644 --- a/forecasting_tools/forecast_bots/official_bots/fall_research_only_bot.py +++ b/forecasting_tools/forecast_bots/official_bots/research_only_bot_2025_fall.py @@ -13,7 +13,7 @@ MultipleChoiceQuestion, NumericQuestion, ) -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( +from forecasting_tools.forecast_bots.official_bots.template_bot_2025_fall import ( FallTemplateBot2025, ) from forecasting_tools.util.misc import clean_indents diff --git a/forecasting_tools/forecast_bots/official_bots/fall_template_bot.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2025_fall.py similarity index 100% rename from forecasting_tools/forecast_bots/official_bots/fall_template_bot.py rename to forecasting_tools/forecast_bots/official_bots/template_bot_2025_fall.py diff --git a/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py new file mode 100644 index 00000000..5f5f02c1 --- /dev/null +++ b/forecasting_tools/forecast_bots/official_bots/template_bot_2026_spring.py @@ -0,0 +1,729 @@ +import argparse +import asyncio +import logging +from datetime import datetime, timezone +from typing import Literal + +from forecasting_tools.agents_and_tools.research.smart_searcher import SmartSearcher +from forecasting_tools.ai_models.general_llm import GeneralLlm +from forecasting_tools.data_models.binary_report import BinaryPrediction +from forecasting_tools.data_models.conditional_models import ( + ConditionalPrediction, + PredictionAffirmed, +) +from forecasting_tools.data_models.data_organizer import PredictionTypes +from forecasting_tools.data_models.forecast_report import ReasonedPrediction +from forecasting_tools.data_models.multiple_choice_report import PredictedOptionList +from forecasting_tools.data_models.numeric_report import ( + DatePercentile, + NumericDistribution, + Percentile, +) +from forecasting_tools.data_models.questions import ( + BinaryQuestion, + ConditionalQuestion, + DateQuestion, + MetaculusQuestion, + MultipleChoiceQuestion, + NumericQuestion, +) +from forecasting_tools.forecast_bots.forecast_bot import ForecastBot +from forecasting_tools.helpers.asknews_searcher import AskNewsSearcher +from forecasting_tools.helpers.metaculus_client import MetaculusClient +from forecasting_tools.helpers.structure_output import structure_output +from forecasting_tools.util.misc import clean_indents + +logger = logging.getLogger(__name__) + + +class SpringTemplateBot2026(ForecastBot): + """ + This is the template bot for Spring 2026 Metaculus AI Tournament. + This is a copy of what is used by Metaculus to run the Metac Bots in our benchmark, provided as a template for new bot makers. + This template is given as-is, and is use-at-your-own-risk. + We have covered most test cases in forecasting-tools it may be worth double checking key components locally. + So far our track record has been 1 mentionable bug per season (affecting forecasts for 1-2% of total questions) + + Main changes since Fall: + - Additional prompting has been added to numeric questions to emphasize putting percentile values in the correct order. + - Support for conditional and date questions has been added + - Note: Spring AIB will not use date/conditional questions, so these are only for forecasting on the main site as you wish. + + The main entry point of this bot is `bot.forecast_on_tournament(tournament_id)` in the parent class. + See the script at the bottom of the file for more details on how to run the bot. + Ignoring the finer details, the general flow is: + - Load questions from Metaculus + - For each question + - Execute run_research a number of times equal to research_reports_per_question + - Execute respective run_forecast function `predictions_per_research_report * research_reports_per_question` times + - Aggregate the predictions + - Submit prediction (if publish_reports_to_metaculus is True) + - Return a list of ForecastReport objects + + Alternatively, you can use the MetaculusClient to make a custom filter of questions to forecast on + and forecast them with `bot.forecast_questions(questions)` + + Only the research and forecast functions need to be implemented in ForecastBot subclasses, + though you may want to override other ForecastBot functions. + In this example, you can change the prompts to be whatever you want since, + structure_output uses an LLM to intelligently reformat the output into the needed structure. + + By default (i.e. 'tournament' mode), when you run this script, it will forecast on any open questions in the + primary bot tournament and MiniBench. If you want to forecast on only one or the other, you can remove one + of them from the 'tournament' mode code at the bottom of the file. + + You can experiment with what models work best with your bot by using the `llms` parameter when initializing the bot. + You can initialize the bot with any number of models. For example, + ```python + my_bot = MyBot( + ... + llms={ # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you + "default": GeneralLlm( + model="openrouter/openai/gpt-4o", # "anthropic/claude-sonnet-4-20250514", etc (see docs for litellm) + temperature=0.3, + timeout=40, + allowed_tries=2, + ), + "summarizer": "openai/gpt-4o-mini", + "researcher": "asknews/news-summaries", + "parser": "openai/gpt-4o-mini", + }, + ) + ``` + + Then you can access the model in custom functions like this: + ```python + research_strategy = self.get_llm("researcher", "model_name") + if research_strategy == "asknews/news-summaries": + ... + # OR + summarizer = await self.get_llm("summarizer", "llm").invoke(prompt) + # OR + reasoning = await self.get_llm("default", "llm").invoke(prompt) + ``` + + If you end up having trouble with rate limits and want to try a more sophisticated rate limiter try: + ```python + from forecasting_tools import RefreshingBucketRateLimiter + rate_limiter = RefreshingBucketRateLimiter( + capacity=2, + refresh_rate=1, + ) # Allows 1 request per second on average with a burst of 2 requests initially. Set this as a class variable + await self.rate_limiter.wait_till_able_to_acquire_resources(1) # 1 because it's consuming 1 request (use more if you are adding a token limit) + ``` + Additionally OpenRouter has large rate limits immediately on account creation + """ + + _max_concurrent_questions = ( + 1 # Set this to whatever works for your search-provider/ai-model rate limits + ) + _concurrency_limiter = asyncio.Semaphore(_max_concurrent_questions) + _structure_output_validation_samples = 2 + + ##################################### RESEARCH ##################################### + + async def run_research(self, question: MetaculusQuestion) -> str: + async with self._concurrency_limiter: + research = "" + researcher = self.get_llm("researcher") + + prompt = clean_indents( + f""" + You are an assistant to a superforecaster. + The superforecaster will give you a question they intend to forecast on. + To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information. + You do not produce forecasts yourself. + + Question: + {question.question_text} + + This question's outcome will be determined by the specific criteria below: + {question.resolution_criteria} + + {question.fine_print} + """ + ) + + if isinstance(researcher, GeneralLlm): + research = await researcher.invoke(prompt) + elif ( + researcher == "asknews/news-summaries" + or researcher == "asknews/deep-research/low-depth" + or researcher == "asknews/deep-research/medium-depth" + or researcher == "asknews/deep-research/high-depth" + ): + research = await AskNewsSearcher().call_preconfigured_version( + researcher, prompt + ) + elif researcher.startswith("smart-searcher"): + model_name = researcher.removeprefix("smart-searcher/") + searcher = SmartSearcher( + model=model_name, + temperature=0, + num_searches_to_run=2, + num_sites_per_search=10, + use_advanced_filters=False, + ) + research = await searcher.invoke(prompt) + elif not researcher or researcher == "None" or researcher == "no_research": + research = "" + else: + research = await self.get_llm("researcher", "llm").invoke(prompt) + logger.info(f"Found Research for URL {question.page_url}:\n{research}") + return research + + ##################################### BINARY QUESTIONS ##################################### + + async def _run_forecast_on_binary( + self, question: BinaryQuestion, research: str + ) -> ReasonedPrediction[float]: + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + Question background: + {question.background_info} + + + This question's outcome will be determined by the specific criteria below. These criteria have not yet been satisfied: + {question.resolution_criteria} + + {question.fine_print} + + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The status quo outcome if nothing changed. + (c) A brief description of a scenario that results in a No outcome. + (d) A brief description of a scenario that results in a Yes outcome. + + You write your rationale remembering that good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time. + {self._get_conditional_disclaimer_if_necessary(question)} + + The last thing you write is your final answer as: "Probability: ZZ%", 0-100 + """ + ) + + return await self._binary_prompt_to_forecast(question, prompt) + + async def _binary_prompt_to_forecast( + self, + question: BinaryQuestion, + prompt: str, + ) -> ReasonedPrediction[float]: + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + binary_prediction: BinaryPrediction = await structure_output( + reasoning, + BinaryPrediction, + model=self.get_llm("parser", "llm"), + num_validation_samples=self._structure_output_validation_samples, + ) + decimal_pred = max(0.01, min(0.99, binary_prediction.prediction_in_decimal)) + + logger.info( + f"Forecasted URL {question.page_url} with prediction: {decimal_pred}." + ) + return ReasonedPrediction(prediction_value=decimal_pred, reasoning=reasoning) + + ##################################### MULTIPLE CHOICE QUESTIONS ##################################### + + async def _run_forecast_on_multiple_choice( + self, question: MultipleChoiceQuestion, research: str + ) -> ReasonedPrediction[PredictedOptionList]: + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + The options are: {question.options} + + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The status quo outcome if nothing changed. + (c) A description of a scenario that results in an unexpected outcome. + + {self._get_conditional_disclaimer_if_necessary(question)} + You write your rationale remembering that (1) good forecasters put extra weight on the status quo outcome since the world changes slowly most of the time, and (2) good forecasters leave some moderate probability on most options to account for unexpected outcomes. + + The last thing you write is your final probabilities for the N options in this order {question.options} as: + Option_A: Probability_A + Option_B: Probability_B + ... + Option_N: Probability_N + """ + ) + return await self._multiple_choice_prompt_to_forecast(question, prompt) + + async def _multiple_choice_prompt_to_forecast( + self, + question: MultipleChoiceQuestion, + prompt: str, + ) -> ReasonedPrediction[PredictedOptionList]: + parsing_instructions = clean_indents( + f""" + Make sure that all option names are one of the following: + {question.options} + + The text you are parsing may prepend these options with some variation of "Option" which you should remove if not part of the option names I just gave you. + Additionally, you may sometimes need to parse a 0% probability. Please do not skip options with 0% but rather make it an entry in your final list with 0% probability. + """ + ) + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + predicted_option_list: PredictedOptionList = await structure_output( + text_to_structure=reasoning, + output_type=PredictedOptionList, + model=self.get_llm("parser", "llm"), + num_validation_samples=self._structure_output_validation_samples, + additional_instructions=parsing_instructions, + ) + + logger.info( + f"Forecasted URL {question.page_url} with prediction: {predicted_option_list}." + ) + return ReasonedPrediction( + prediction_value=predicted_option_list, reasoning=reasoning + ) + + ##################################### NUMERIC QUESTIONS ##################################### + + async def _run_forecast_on_numeric( + self, question: NumericQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + Units for answer: {question.unit_of_measure if question.unit_of_measure else "Not stated (please infer this)"} + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + {lower_bound_message} + {upper_bound_message} + + Formatting Instructions: + - Please notice the units requested and give your answer in these units (e.g. whether you represent a number as 1,000,000 or 1 million). + - Never use scientific notation. + - Always start with a smaller number (more negative if negative) and then increase from there. The value for percentile 10 should always be less than the value for percentile 20, and so on. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The outcome if nothing changed. + (c) The outcome if the current trend continued. + (d) The expectations of experts and markets. + (e) A brief description of an unexpected scenario that results in a low outcome. + (f) A brief description of an unexpected scenario that results in a high outcome. + + {self._get_conditional_disclaimer_if_necessary(question)} + You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. + + The last thing you write is your final answer as: + " + Percentile 10: XX (lowest number value) + Percentile 20: XX + Percentile 40: XX + Percentile 60: XX + Percentile 80: XX + Percentile 90: XX (highest number value) + " + """ + ) + return await self._numeric_prompt_to_forecast(question, prompt) + + async def _numeric_prompt_to_forecast( + self, + question: NumericQuestion, + prompt: str, + ) -> ReasonedPrediction[NumericDistribution]: + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + parsing_instructions = clean_indents( + f""" + The text given to you is trying to give a forecast distribution for a numeric question. + - This text is trying to answer the numeric question: "{question.question_text}". + - When parsing the text, please make sure to give the values (the ones assigned to percentiles) in terms of the correct units. + - The units for the forecast are: {question.unit_of_measure} + - Your work will be shown publicly with these units stated verbatim after the numbers your parse. + - As an example, someone else guessed that the answer will be between {question.lower_bound} {question.unit_of_measure} and {question.upper_bound} {question.unit_of_measure}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". + - If the answer doesn't give the answer in the correct units, you should parse it in the right units. For instance if the answer gives numbers as $500,000,000 and units are "B $" then you should parse the answer as 0.5 (since $500,000,000 is $0.5 billion). + - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. + - Turn any values that are in scientific notation into regular numbers. + """ + ) + percentile_list: list[Percentile] = await structure_output( + reasoning, + list[Percentile], + model=self.get_llm("parser", "llm"), + additional_instructions=parsing_instructions, + num_validation_samples=self._structure_output_validation_samples, + ) + prediction = NumericDistribution.from_question(percentile_list, question) + logger.info( + f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." + ) + return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) + + ##################################### DATE QUESTIONS ##################################### + + async def _run_forecast_on_date( + self, question: DateQuestion, research: str + ) -> ReasonedPrediction[NumericDistribution]: + upper_bound_message, lower_bound_message = ( + self._create_upper_and_lower_bound_messages(question) + ) + prompt = clean_indents( + f""" + You are a professional forecaster interviewing for a job. + + Your interview question is: + {question.question_text} + + Background: + {question.background_info} + + {question.resolution_criteria} + + {question.fine_print} + + Your research assistant says: + {research} + + Today is {datetime.now().strftime("%Y-%m-%d")}. + + {lower_bound_message} + {upper_bound_message} + + Formatting Instructions: + - This is a date question, and as such, the answer must be expressed in terms of dates. + - The dates must be written in the format of YYYY-MM-DD. If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed. + - Always start with a lower date chronologically and then increase from there. + - Do NOT forget this. The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there. + + Before answering you write: + (a) The time left until the outcome to the question is known. + (b) The outcome if nothing changed. + (c) The outcome if the current trend continued. + (d) The expectations of experts and markets. + (e) A brief description of an unexpected scenario that results in a low outcome. + (f) A brief description of an unexpected scenario that results in a high outcome. + + {self._get_conditional_disclaimer_if_necessary(question)} + You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns. + + The last thing you write is your final answer as: + " + Percentile 10: YYYY-MM-DD (oldest date) + Percentile 20: YYYY-MM-DD + Percentile 40: YYYY-MM-DD + Percentile 60: YYYY-MM-DD + Percentile 80: YYYY-MM-DD + Percentile 90: YYYY-MM-DD (newest date) + " + """ + ) + forecast = await self._date_prompt_to_forecast(question, prompt) + return forecast + + async def _date_prompt_to_forecast( + self, + question: DateQuestion, + prompt: str, + ) -> ReasonedPrediction[NumericDistribution]: + reasoning = await self.get_llm("default", "llm").invoke(prompt) + logger.info(f"Reasoning for URL {question.page_url}: {reasoning}") + parsing_instructions = clean_indents( + f""" + The text given to you is trying to give a forecast distribution for a date question. + - This text is trying to answer the question: "{question.question_text}". + - As an example, someone else guessed that the answer will be between {question.lower_bound} and {question.upper_bound}, so the numbers parsed from an answer like this would be verbatim "{question.lower_bound}" and "{question.upper_bound}". + - The output is given as dates/times please format it into a valid datetime parsable string. Assume midnight UTC if no hour is given. + - If percentiles are not explicitly given (e.g. only a single value is given) please don't return a parsed output, but rather indicate that the answer is not explicitly given in the text. + """ + ) + date_percentile_list: list[DatePercentile] = await structure_output( + reasoning, + list[DatePercentile], + model=self.get_llm("parser", "llm"), + additional_instructions=parsing_instructions, + num_validation_samples=self._structure_output_validation_samples, + ) + + percentile_list = [ + Percentile( + percentile=percentile.percentile, + value=percentile.value.timestamp(), + ) + for percentile in date_percentile_list + ] + prediction = NumericDistribution.from_question(percentile_list, question) + logger.info( + f"Forecasted URL {question.page_url} with prediction: {prediction.declared_percentiles}." + ) + return ReasonedPrediction(prediction_value=prediction, reasoning=reasoning) + + def _create_upper_and_lower_bound_messages( + self, question: NumericQuestion | DateQuestion + ) -> tuple[str, str]: + if isinstance(question, NumericQuestion): + if question.nominal_upper_bound is not None: + upper_bound_number = question.nominal_upper_bound + else: + upper_bound_number = question.upper_bound + if question.nominal_lower_bound is not None: + lower_bound_number = question.nominal_lower_bound + else: + lower_bound_number = question.lower_bound + unit_of_measure = question.unit_of_measure + elif isinstance(question, DateQuestion): + upper_bound_number = question.upper_bound.date().isoformat() + lower_bound_number = question.lower_bound.date().isoformat() + unit_of_measure = "" + else: + raise ValueError() + + if question.open_upper_bound: + upper_bound_message = f"The question creator thinks the number is likely not higher than {upper_bound_number} {unit_of_measure}." + else: + upper_bound_message = f"The outcome can not be higher than {upper_bound_number} {unit_of_measure}." + + if question.open_lower_bound: + lower_bound_message = f"The question creator thinks the number is likely not lower than {lower_bound_number} {unit_of_measure}." + else: + lower_bound_message = f"The outcome can not be lower than {lower_bound_number} {unit_of_measure}." + return upper_bound_message, lower_bound_message + + ##################################### CONDITIONAL QUESTIONS ##################################### + + async def _run_forecast_on_conditional( + self, question: ConditionalQuestion, research: str + ) -> ReasonedPrediction[ConditionalPrediction]: + parent_info, full_research = await self._get_question_prediction_info( + question.parent, research, "parent" + ) + child_info, full_research = await self._get_question_prediction_info( + question.child, research, "child" + ) + yes_info, full_research = await self._get_question_prediction_info( + question.question_yes, full_research, "yes" + ) + no_info, full_research = await self._get_question_prediction_info( + question.question_no, full_research, "no" + ) + full_reasoning = clean_indents( + f""" + ## Parent Question Reasoning + {parent_info.reasoning} + ## Child Question Reasoning + {child_info.reasoning} + ## Yes Question Reasoning + {yes_info.reasoning} + ## No Question Reasoning + {no_info.reasoning} + """ + ) + full_prediction = ConditionalPrediction( + parent=parent_info.prediction_value, # type: ignore + child=child_info.prediction_value, # type: ignore + prediction_yes=yes_info.prediction_value, # type: ignore + prediction_no=no_info.prediction_value, # type: ignore + ) + return ReasonedPrediction( + reasoning=full_reasoning, prediction_value=full_prediction + ) + + async def _get_question_prediction_info( + self, question: MetaculusQuestion, research: str, question_type: str + ) -> tuple[ReasonedPrediction[PredictionTypes | PredictionAffirmed], str]: + from forecasting_tools.data_models.data_organizer import DataOrganizer + + previous_forecasts = question.previous_forecasts + if ( + question_type in ["parent", "child"] + and previous_forecasts + and question_type not in self.force_reforecast_in_conditional + ): + # TODO: add option to not affirm current parent/child forecasts, create new forecast + previous_forecast = previous_forecasts[-1] + current_utc_time = datetime.now(timezone.utc) + if ( + previous_forecast.timestamp_end is None + or previous_forecast.timestamp_end > current_utc_time + ): + pretty_value = DataOrganizer.get_readable_prediction(previous_forecast) # type: ignore + prediction = ReasonedPrediction( + prediction_value=PredictionAffirmed(), + reasoning=f"Already existing forecast reaffirmed at {pretty_value}.", + ) + return (prediction, research) # type: ignore + info = await self._make_prediction(question, research) + full_research = self._add_reasoning_to_research(research, info, question_type) + return info, full_research # type: ignore + + def _add_reasoning_to_research( + self, + research: str, + reasoning: ReasonedPrediction[PredictionTypes], + question_type: str, + ) -> str: + from forecasting_tools.data_models.data_organizer import DataOrganizer + + question_type = question_type.title() + return clean_indents( + f""" + {research} + --- + ## {question_type} Question Information + You have previously forecasted the {question_type} Question to the value: {DataOrganizer.get_readable_prediction(reasoning.prediction_value)} + This is relevant information for your current forecast, but it is NOT your current forecast, but previous forecasting information that is relevant to your current forecast. + The reasoning for the {question_type} Question was as such: + ``` + {reasoning.reasoning} + ``` + This is absolutely essential: do NOT use this reasoning to re-forecast the {question_type} question. + """ + ) + + def _get_conditional_disclaimer_if_necessary( + self, question: MetaculusQuestion + ) -> str: + if question.conditional_type not in ["yes", "no"]: + return "" + return clean_indents( + """ + As you are given a conditional question with a parent and child, you are to only forecast the **CHILD** question, given the parent question's resolution. + You never re-forecast the parent question under any circumstances, but you use probabilistic reasoning, strongly considering the parent question's resolution, to forecast the child question. + """ + ) + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + + # Suppress LiteLLM logging + litellm_logger = logging.getLogger("LiteLLM") + litellm_logger.setLevel(logging.WARNING) + litellm_logger.propagate = False + + parser = argparse.ArgumentParser( + description="Run the TemplateBot forecasting system" + ) + parser.add_argument( + "--mode", + type=str, + choices=["tournament", "metaculus_cup", "test_questions"], + default="tournament", + help="Specify the run mode (default: tournament)", + ) + args = parser.parse_args() + run_mode: Literal["tournament", "metaculus_cup", "test_questions"] = args.mode + assert run_mode in [ + "tournament", + "metaculus_cup", + "test_questions", + ], "Invalid run mode" + + template_bot = SpringTemplateBot2026( + research_reports_per_question=1, + predictions_per_research_report=5, + use_research_summary_to_forecast=False, + publish_reports_to_metaculus=True, + folder_to_save_reports_to=None, + skip_previously_forecasted_questions=True, + extra_metadata_in_explanation=True, + # llms={ # choose your model names or GeneralLlm llms here, otherwise defaults will be chosen for you + # "default": GeneralLlm( + # model="openrouter/openai/gpt-4o", # "anthropic/claude-sonnet-4-20250514", etc (see docs for litellm) + # temperature=0.3, + # timeout=40, + # allowed_tries=2, + # ), + # "summarizer": "openai/gpt-4o-mini", + # "researcher": "asknews/news-summaries", + # "parser": "openai/gpt-4o-mini", + # }, + ) + + client = MetaculusClient() + if run_mode == "tournament": + # You may want to change this to the specific tournament ID you want to forecast on + seasonal_tournament_reports = asyncio.run( + template_bot.forecast_on_tournament( + client.CURRENT_AI_COMPETITION_ID, return_exceptions=True + ) + ) + minibench_reports = asyncio.run( + template_bot.forecast_on_tournament( + client.CURRENT_MINIBENCH_ID, return_exceptions=True + ) + ) + forecast_reports = seasonal_tournament_reports + minibench_reports + elif run_mode == "metaculus_cup": + # The Metaculus cup is a good way to test the bot's performance on regularly open questions. You can also use AXC_2025_TOURNAMENT_ID = 32564 or AI_2027_TOURNAMENT_ID = "ai-2027" + # The Metaculus cup may not be initialized near the beginning of a season (i.e. January, May, September) + template_bot.skip_previously_forecasted_questions = False + forecast_reports = asyncio.run( + template_bot.forecast_on_tournament( + client.CURRENT_METACULUS_CUP_ID, return_exceptions=True + ) + ) + elif run_mode == "test_questions": + # Example questions are a good way to test the bot's performance on a single question + EXAMPLE_QUESTIONS = [ + "https://www.metaculus.com/questions/578/human-extinction-by-2100/", # Human Extinction - Binary + "https://www.metaculus.com/questions/14333/age-of-oldest-human-as-of-2100/", # Age of Oldest Human - Numeric + "https://www.metaculus.com/questions/22427/number-of-new-leading-ai-labs/", # Number of New Leading AI Labs - Multiple Choice + "https://www.metaculus.com/c/diffusion-community/38880/how-many-us-labor-strikes-due-to-ai-in-2029/", # Number of US Labor Strikes Due to AI in 2029 - Discrete + ] + template_bot.skip_previously_forecasted_questions = False + questions = [ + client.get_question_by_url(question_url) + for question_url in EXAMPLE_QUESTIONS + ] + forecast_reports = asyncio.run( + template_bot.forecast_questions(questions, return_exceptions=True) + ) + template_bot.log_report_summary(forecast_reports) diff --git a/forecasting_tools/forecast_bots/template_bot.py b/forecasting_tools/forecast_bots/template_bot.py index 02d13f89..f76a699f 100644 --- a/forecasting_tools/forecast_bots/template_bot.py +++ b/forecasting_tools/forecast_bots/template_bot.py @@ -1,12 +1,12 @@ -from forecasting_tools.forecast_bots.official_bots.fall_research_only_bot import ( +from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import ( FallResearchOnlyBot2025, ) -from forecasting_tools.forecast_bots.official_bots.fall_template_bot import ( - FallTemplateBot2025, +from forecasting_tools.forecast_bots.official_bots.template_bot_2026_spring import ( + SpringTemplateBot2026, ) -class TemplateBot(FallTemplateBot2025): +class TemplateBot(SpringTemplateBot2026): pass diff --git a/forecasting_tools/helpers/metaculus_client.py b/forecasting_tools/helpers/metaculus_client.py index 669cb02b..96c488bc 100644 --- a/forecasting_tools/helpers/metaculus_client.py +++ b/forecasting_tools/helpers/metaculus_client.py @@ -108,6 +108,7 @@ class MetaculusClient: AI_COMPETITION_ID_Q1 = 32627 # https://www.metaculus.com/tournament/aibq1/ AI_COMPETITION_ID_Q2 = 32721 # https://www.metaculus.com/tournament/aibq2/ AIB_FALL_2025_ID = 32813 # https://www.metaculus.com/tournament/fall-aib-2025/ + AIB_SPRING_2026_ID = 32916 PRO_COMPARISON_TOURNAMENT_Q1 = 32631 PRO_COMPARISON_TOURNAMENT_Q2 = ( 32761 # https://www.metaculus.com/tournament/pro-benchmark-q22025 @@ -118,6 +119,7 @@ class MetaculusClient: Q1_2025_QUARTERLY_CUP = 32630 METACULUS_CUP_2025_1_ID = 32726 # Summer cup 2025 METACULUS_CUP_FALL_2025_ID = 32828 + METACULUS_CUP_SPRING_2026_ID = None AI_2027_TOURNAMENT_ID = "ai-2027" MAIN_FEED = 144 # site_main @@ -126,7 +128,7 @@ class MetaculusClient: CURRENT_METACULUS_CUP_ID = METACULUS_CUP_FALL_2025_ID CURRENT_QUARTERLY_CUP_ID = CURRENT_METACULUS_CUP_ID # Consider this parameter deprecated since quarterly cup is no longer active - CURRENT_AI_COMPETITION_ID = AIB_FALL_2025_ID + CURRENT_AI_COMPETITION_ID = AIB_SPRING_2026_ID CURRENT_MINIBENCH_ID = "minibench" CURRENT_MARKET_PULSE_ID = Q4_2025_MARKET_PULSE_ID @@ -150,17 +152,6 @@ def __init__( self.sleep_time_between_requests_min = sleep_time_between_requests_min self.sleep_jitter_seconds = sleep_jitter_seconds - @retry_with_exponential_backoff() - def get_current_user_id(self): - self._sleep_between_requests() - response = requests.get( - f"{self.base_url}/users/me", - **self._get_auth_headers(), # type: ignore - ) - raise_for_status_with_additional_info(response) - content = json.loads(response.content) - return content["id"] - @retry_with_exponential_backoff() def post_question_comment( self, @@ -184,102 +175,6 @@ def post_question_comment( logger.info(f"Posted comment on post {post_id}") raise_for_status_with_additional_info(response) - @retry_with_exponential_backoff() - def post_question_link( - self, - question1_id: int, - question2_id: int, - direction: int, - strength: int, - link_type: str, - ) -> int: - """ - Posts a link between questions - :param question1_id - :param question2_id - :param direction: +1 for positive, -1 for negative - :param strength: 1 for low, 2 for medium, 5 for high - :param link_type: only supports "causal" for now - :return: id of the created link - """ - self._sleep_between_requests() - response = requests.post( - f"{self.base_url}/coherence/links/create/", - json={ - "question1_id": question1_id, - "question2_id": question2_id, - "direction": direction, - "strength": strength, - "type": link_type, - }, - **self._get_auth_headers(), # type: ignore - timeout=self.timeout, - ) - logger.info(f"Posted question link between {question1_id} and {question2_id}") - raise_for_status_with_additional_info(response) - content = json.loads(response.content) - return content["id"] - - @retry_with_exponential_backoff() - def get_links_for_question(self, question_id: int) -> List[DetailedCoherenceLink]: - """ - Returns all links associated with a specific question - direction is +1 for positive and -1 for negative - strength is 1 for low, 2 for medium and 5 for high - """ - self._sleep_between_requests() - response = requests.get( - f"{self.base_url}/coherence/question/{question_id}/links/", - **self._get_auth_headers(), # type: ignore - timeout=self.timeout, - ) - raise_for_status_with_additional_info(response) - content = json.loads(response.content)["data"] - links = [ - DetailedCoherenceLink.from_metaculus_api_json(link) for link in content - ] - return links - - @retry_with_exponential_backoff() - def delete_question_link(self, link_id: int): - self._sleep_between_requests() - response = requests.delete( - f"{self.base_url}/coherence/links/{link_id}/delete/", - **self._get_auth_headers(), # type: ignore - timeout=self.timeout, - ) - logger.info(f"Deleted question link with id {link_id}") - raise_for_status_with_additional_info(response) - - @retry_with_exponential_backoff() - def get_needs_update_questions( - self, - question_id: int, - last_datetime: datetime, - user_id_for_links: int | None = None, - ) -> NeedsUpdateResponse: - self._sleep_between_requests() - json_data: dict[str, Any] = {"datetime": last_datetime.isoformat()} - if user_id_for_links: - json_data["user_id_for_links"] = user_id_for_links - response = requests.get( - f"{self.base_url}/coherence/question/{question_id}/links/needs-update/", - **self._get_auth_headers(), # type: ignore - timeout=self.timeout, - json=json_data, - ) - raise_for_status_with_additional_info(response) - content = json.loads(response.content) - questions = [ - DataOrganizer.get_question_from_question_json(json_q) - for json_q in content["questions"] - ] - links = [ - CoherenceLink.model_validate(json_link) for json_link in content["links"] - ] - result = NeedsUpdateResponse(questions=questions, links=links) - return result - def post_binary_question_prediction( self, question_id: int, prediction_in_decimal: float ) -> None: @@ -504,6 +399,113 @@ def get_benchmark_questions( questions = typeguard.check_type(questions, list[BinaryQuestion]) return questions + @retry_with_exponential_backoff() + def get_current_user_id(self) -> int: + self._sleep_between_requests() + response = requests.get( + f"{self.base_url}/users/me", + **self._get_auth_headers(), # type: ignore + ) + raise_for_status_with_additional_info(response) + content = json.loads(response.content) + return int(content["id"]) + + @retry_with_exponential_backoff() + def post_question_link( + self, + question1_id: int, + question2_id: int, + direction: int, + strength: int, + link_type: str, + ) -> int: + """ + Posts a link between questions + :param question1_id + :param question2_id + :param direction: +1 for positive, -1 for negative + :param strength: 1 for low, 2 for medium, 5 for high + :param link_type: only supports "causal" for now + :return: id of the created link + """ + self._sleep_between_requests() + response = requests.post( + f"{self.base_url}/coherence/links/create/", + json={ + "question1_id": question1_id, + "question2_id": question2_id, + "direction": direction, + "strength": strength, + "type": link_type, + }, + **self._get_auth_headers(), # type: ignore + timeout=self.timeout, + ) + logger.info(f"Posted question link between {question1_id} and {question2_id}") + raise_for_status_with_additional_info(response) + content = json.loads(response.content) + return content["id"] + + @retry_with_exponential_backoff() + def get_links_for_question(self, question_id: int) -> List[DetailedCoherenceLink]: + """ + Returns all links associated with a specific question + direction is +1 for positive and -1 for negative + strength is 1 for low, 2 for medium and 5 for high + """ + self._sleep_between_requests() + response = requests.get( + f"{self.base_url}/coherence/question/{question_id}/links/", + **self._get_auth_headers(), # type: ignore + timeout=self.timeout, + ) + raise_for_status_with_additional_info(response) + content = json.loads(response.content)["data"] + links = [ + DetailedCoherenceLink.from_metaculus_api_json(link) for link in content + ] + return links + + @retry_with_exponential_backoff() + def delete_question_link(self, link_id: int): + self._sleep_between_requests() + response = requests.delete( + f"{self.base_url}/coherence/links/{link_id}/delete/", + **self._get_auth_headers(), # type: ignore + timeout=self.timeout, + ) + logger.info(f"Deleted question link with id {link_id}") + raise_for_status_with_additional_info(response) + + @retry_with_exponential_backoff() + def get_needs_update_questions( + self, + question_id: int, + last_datetime: datetime, + user_id_for_links: int | None = None, + ) -> NeedsUpdateResponse: + self._sleep_between_requests() + json_data: dict[str, Any] = {"datetime": last_datetime.isoformat()} + if user_id_for_links: + json_data["user_id_for_links"] = user_id_for_links + response = requests.get( + f"{self.base_url}/coherence/question/{question_id}/links/needs-update/", + **self._get_auth_headers(), # type: ignore + timeout=self.timeout, + json=json_data, + ) + raise_for_status_with_additional_info(response) + content = json.loads(response.content) + questions = [ + DataOrganizer.get_question_from_question_json(json_q) + for json_q in content["questions"] + ] + links = [ + CoherenceLink.model_validate(json_link) for json_link in content["links"] + ] + result = NeedsUpdateResponse(questions=questions, links=links) + return result + def _get_auth_headers(self) -> dict[str, dict[str, str]]: METACULUS_TOKEN = os.getenv("METACULUS_TOKEN") if METACULUS_TOKEN is None: diff --git a/pyproject.toml b/pyproject.toml index 94aa43e6..c6327418 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "forecasting-tools" -version = "0.2.79" +version = "0.2.80" description = "AI forecasting and research tools to help humans reason about and forecast the future" authors = ["Benjamin Wilson "] license = "MIT" diff --git a/run_bots.py b/run_bots.py index 522d10e8..b3fb197e 100644 --- a/run_bots.py +++ b/run_bots.py @@ -20,12 +20,12 @@ from forecasting_tools.data_models.forecast_report import ForecastReport from forecasting_tools.data_models.questions import DateQuestion, MetaculusQuestion from forecasting_tools.forecast_bots.forecast_bot import ForecastBot -from forecasting_tools.forecast_bots.official_bots.fall_research_only_bot import ( - FallResearchOnlyBot2025, -) from forecasting_tools.forecast_bots.official_bots.gpt_4_1_optimized_bot import ( GPT41OptimizedBot, ) +from forecasting_tools.forecast_bots.official_bots.research_only_bot_2025_fall import ( + FallResearchOnlyBot2025, +) from forecasting_tools.forecast_bots.official_bots.uniform_probability_bot import ( UniformProbabilityBot, )