Voozh

Training script: https://gist.github.com/notlober/9bf4c3ab6ddeb12ec669ca495653708a

Inference code:

from transformers import AutoModelForCausalLM, AutoTokenizer

#####
max_new_toks = 2048  # passed to model.generate as max_new_tokens
N_BEAMS = 5  # used for both num_beams and num_beam_groups in model.generate
#####

def do_instruct(prompt):
    """Wrap ``prompt`` in the reasoning-format instruction text.

    The returned string asks for reasoning inside <think> tags and the final
    answer inside <answer> tags, then appends the user's prompt followed by
    the "Assistant:" cue.
    """
    preamble = (
        "A conversation between User and Assistant. "
        "The user asks a question, and the Assistant solves it. "
        "The assistant first does reasoning process in the mind and then provides the user with the answer. "
        "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, "
        "i.e., <think> reasoning process here </think> <answer> answer here </answer>. "
    )
    return f"{preamble}User: {prompt} Assistant:"

def generate_output_once(prompt):
    """Generate one completion for ``prompt`` and return only the new text.

    The prompt is wrapped by ``do_instruct``, rendered through the tokenizer's
    chat template as a single user turn, and passed to the module-level
    ``model``. Prompt tokens are stripped before decoding.
    """
    chat = [{"role": "user", "content": do_instruct(prompt)}]
    rendered = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(rendered, return_tensors="pt")
    model_inputs = encoded.input_ids.to(model.device)

    # N_BEAMS beams split into N_BEAMS groups, with a diversity penalty
    # between groups and a repetition penalty on generated tokens.
    generation_kwargs = {
        "max_new_tokens": max_new_toks,
        "repetition_penalty": 1.2,
        "num_beam_groups": N_BEAMS,
        "num_beams": N_BEAMS,
        "diversity_penalty": 0.5,
        "early_stopping": True,
        # Author's note: do not set to True if you get a warning, skip it.
        "do_sample": False,
    }
    generated_ids = model.generate(model_inputs, **generation_kwargs)

    # Slice off the prompt so only the continuation is decoded.
    continuation = generated_ids[0, model_inputs.shape[1]:]
    return tokenizer.decode(continuation, skip_special_tokens=True)

def test_gen(prompt):
    """Generate a completion for ``prompt`` and print it after "Answer: "."""
    print(f"Answer: {generate_output_once(prompt)}")

#####
# Load the fine-tuned checkpoint; dtype and device placement are left to "auto".
model = AutoModelForCausalLM.from_pretrained(
 "notbdq/gemma-grpo",
 torch_dtype="auto",
 device_map="auto"
)
# The tokenizer is loaded from the Qwen2.5-14B-Instruct-1M repo, not the model repo.
# NOTE(review): presumably the checkpoint is a fine-tune of that Qwen model — confirm.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
#####

# Runs one generation and prints the result.
test_gen("...") # put your prompt here

Benchmarks: it is definitely better than Qwen 2.5 14B 1M, but I have only tested it on 15 samples of the AIME validation set. It was doing better than Qwen 2.5 1M from the first sample onward, but there are 75 more samples to go, so I am sharing the script in case someone wants to benchmark it:

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

#####
max_new_toks = 2048  # passed to model.generate as max_new_tokens by both generate helpers
N_BEAMS = 5  # used for num_beams and num_beam_groups in generate_output_once_grpo
#####

def do_instruct(prompt):
    """Build the instruction-wrapped prompt sent to the fine-tuned model.

    The fixed instruction text describes the <think>/<answer> output format;
    ``prompt`` is inserted after "User:" and the string ends with "Assistant:".
    """
    pieces = (
        "A conversation between User and Assistant.",
        "The user asks a question, and the Assistant solves it.",
        "The assistant first does reasoning process in the mind and then provides the user with the answer.",
        "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively,",
        "i.e., <think> reasoning process here </think> <answer> answer here </answer>.",
        f"User: {prompt} Assistant:",
    )
    return " ".join(pieces)

def generate_output_once_grpo(model, prompt):
    """Generate one completion from ``model`` for the instruction-wrapped prompt.

    ``prompt`` is passed through ``do_instruct``, rendered with the tokenizer's
    chat template as a single user turn, and generated with the beam settings
    below. Only the newly generated tokens are decoded and returned.
    """
    chat = [{"role": "user", "content": do_instruct(prompt)}]
    rendered = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(rendered, return_tensors="pt")
    model_inputs = encoded.input_ids.to(model.device)

    # N_BEAMS beams split into N_BEAMS groups, with a diversity penalty
    # between groups and a repetition penalty on generated tokens.
    generation_kwargs = {
        "max_new_tokens": max_new_toks,
        "repetition_penalty": 1.2,
        "num_beam_groups": N_BEAMS,
        "num_beams": N_BEAMS,
        "diversity_penalty": 0.5,
        "early_stopping": True,
        # Author's note: do not set to True if you get a warning, skip it.
        "do_sample": False,
    }
    generated_ids = model.generate(model_inputs, **generation_kwargs)

    # Slice off the prompt so only the continuation is decoded.
    continuation = generated_ids[0, model_inputs.shape[1]:]
    return tokenizer.decode(continuation, skip_special_tokens=True)

def generate_output_once(model, prompt):
    """Generate one completion from ``model`` for the raw, unwrapped ``prompt``.

    The prompt is sent as a single user turn through the tokenizer's chat
    template. Only ``max_new_tokens`` is passed to ``generate``; every other
    generation setting is left to the model's own configuration.
    """
    chat = [{"role": "user", "content": prompt}]
    rendered = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(rendered, return_tensors="pt")
    model_inputs = encoded.input_ids.to(model.device)

    generated_ids = model.generate(model_inputs, max_new_tokens=max_new_toks)

    # Slice off the prompt so only the continuation is decoded.
    continuation = generated_ids[0, model_inputs.shape[1]:]
    return tokenizer.decode(continuation, skip_special_tokens=True)

def check_model_contain_output(model_output, ground_t_output):
    """Return True when the ground-truth answer appears in the model output.

    This is a plain substring test over the whole response.

    Args:
        model_output: Decoded text produced by the model.
        ground_t_output: Expected answer. It is converted with ``str`` so a
            numeric answer does not raise ``TypeError`` on the ``in`` test.

    Returns:
        bool: True if the answer text occurs anywhere in ``model_output``.
    """
    return str(ground_t_output) in model_output

def extract_answer(text):
    """Return the text between the first ``<answer>`` tag and the next ``</answer>``.

    Args:
        text: Model response to search.

    Returns:
        The text following the first ``<answer>`` tag, cut at the next
        ``<answer>`` or ``</answer>`` (surrounding whitespace is kept). If the
        closing tag is missing, everything after the opening tag is returned.
        Returns None when there is no ``<answer>`` tag or ``text`` is not a
        string.
    """
    try:
        after_open = text.split("<answer>")[1]
    except (AttributeError, IndexError, TypeError):
        # No opening tag, or text is not a str. Only these are caught so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return None
    return after_open.split("</answer>")[0]

def do_eval(debug):
    """Compare the fine-tuned model with the Qwen baseline on ``eval_dataset``.

    For each row, both models answer ``row["problem"]``. A response counts as
    a win when ``check_model_contain_output`` finds ``row["answer"]`` in it.
    After every sample the running accuracy of each model is printed, computed
    over the samples seen so far.

    Args:
        debug: When truthy, also print both raw responses for each sample.
    """
    # NOTE(review): scoring is a substring match over the whole response, so a
    # short answer such as "7" also matches "17". extract_answer is defined in
    # this file but is not used here — confirm the intended scoring.
    total_iters = len(eval_dataset)
    wins_reasoning = 0
    wins_qwen = 0
    for seen, row in enumerate(eval_dataset, start=1):
        problem = row["problem"]
        ground_truth = row["answer"]
        response = generate_output_once_grpo(model, problem)
        response_qwen = generate_output_once(model_qwen, problem)
        if check_model_contain_output(response, ground_truth):
            wins_reasoning += 1
        if check_model_contain_output(response_qwen, ground_truth):
            wins_qwen += 1
        # Divide by the samples processed so far, so interim figures are real
        # accuracies; ":.2%" turns the fraction into a percentage.
        print(f"[{seen}/{total_iters}] reasoning model: {wins_reasoning / seen:.2%}")
        print(f"[{seen}/{total_iters}] qwen model: {wins_qwen / seen:.2%}")
        if debug:
            print("qwen:", response_qwen)
            print("reasoning fine tuned:", response)

#####
# Fine-tuned checkpoint under evaluation; dtype and device placement are left to "auto".
model = AutoModelForCausalLM.from_pretrained(
 "notbdq/gemma-grpo",
 torch_dtype="auto",
 device_map="auto"
)
# Baseline model the fine-tuned checkpoint is compared against.
model_qwen = AutoModelForCausalLM.from_pretrained(
 "Qwen/Qwen2.5-14B-Instruct-1M",
 torch_dtype="auto",
 device_map="auto"
)
# A single tokenizer, loaded from the baseline repo, is shared by both models.
# NOTE(review): presumably the fine-tuned checkpoint uses the same vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
# The split named "train" of the AIME validation dataset is what gets evaluated.
eval_dataset = load_dataset("AI-MO/aimo-validation-aime", split="train")
#####

# Run the comparison without printing raw responses.
do_eval(debug=False)