import math


def approximate_number_of_input_tokens_for_input(
number_of_tokens_in_text: int,
number_of_boilerplate_tokens: int,
# Chunking-specific parameters (only applicable if chunking is enabled)
chunk_size: int | None = None,
chunk_overlap_ratio: float | None = None,
# Statement-specific parameters (only applicable if using IQL)
number_of_tokens_in_longest_statement: int | None = None,
average_number_of_tokens_in_statements: int | None = None,
number_of_statements: int | None = None,
) -> int:
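    """Approximate the total number of input tokens needed to process the given input.

    The estimate accounts for the text itself, per-request boilerplate, optional
    chunking (overlap is modelled as extra chunks), and optional repetition of each
    chunk for every statement when the statement-specific parameters are supplied.
    """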
    # Statement-specific parameters must be provided together: either all of them or none.
    statement_parameters = (
        number_of_tokens_in_longest_statement,
        average_number_of_tokens_in_statements,
        number_of_statements,
    )
    if len({parameter is None for parameter in statement_parameters}) != 1:
        raise ValueError("You can either provide all of the statement-specific parameters or none of them.")
    if number_of_tokens_in_longest_statement is None:
        # No statements were supplied: they contribute no tokens, and the input counts once.
        number_of_tokens_in_longest_statement = 0
        average_number_of_tokens_in_statements = 0
        number_of_statements = 1
    # Chunking parameters must also be provided together.
    if (chunk_size is None) != (chunk_overlap_ratio is None):
        raise ValueError("You can either provide both chunk_size and chunk_overlap_ratio or neither of them.")
    if chunk_size is None:
        number_of_chunks = 1
    else:
        # Each chunk must leave room for the boilerplate and the longest statement,
        # so only the remainder of the chunk is available for the text itself.
        effective_chunk_size = chunk_size - number_of_boilerplate_tokens - number_of_tokens_in_longest_statement
        if effective_chunk_size <= 0:
            raise ValueError("chunk_size is too small to fit the boilerplate and the longest statement.")
        # Overlap is modelled as additional chunks on top of the non-overlapping chunk count.
        number_of_chunks = math.ceil(number_of_tokens_in_text / effective_chunk_size) * (1 + chunk_overlap_ratio)
    # Each chunk is sent once per statement; every request carries its share of the
    # text plus the boilerplate and the (average) statement.
    approximate_number_of_input_tokens = (
        (
            (number_of_tokens_in_text / number_of_chunks)
            + number_of_boilerplate_tokens
            + average_number_of_tokens_in_statements
        )
        * number_of_statements
        * number_of_chunks
    )
    # Round up so the result matches the declared int return type even when the
    # chunk count is fractional.
    return math.ceil(approximate_number_of_input_tokens)
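

# A minimal usage sketch. The numbers below are illustrative assumptions
# (a ~10,000-token document, ~200 boilerplate tokens per request, 2,000-token
# chunks with 10% overlap, three ~50-token statements), not measurements from
# any real model or corpus.
if __name__ == "__main__":
    # Plain input: no chunking, no statements.
    print(
        approximate_number_of_input_tokens_for_input(
            number_of_tokens_in_text=10_000,
            number_of_boilerplate_tokens=200,
        )
    )
    # Chunked input evaluated against three statements.
    print(
        approximate_number_of_input_tokens_for_input(
            number_of_tokens_in_text=10_000,
            number_of_boilerplate_tokens=200,
            chunk_size=2_000,
            chunk_overlap_ratio=0.1,
            number_of_tokens_in_longest_statement=80,
            average_number_of_tokens_in_statements=50,
            number_of_statements=3,
        )
    )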