Assume yourself as a great NLP and AI researcher ...
Created: July 15, 2025
Assume yourself as a great NLP and AI researcher
```python
def compute_reward(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    rewards = []
    for res, ans in zip(responses, answer):
        # Exact match gets the full bonus.
        if res == ans:
            rewards.append(1.5)
            continue
        # Otherwise blend ROUGE-L and BLEU (weighted 0.6 / 0.4).
        reward = 0.0
        rouge_l = compute_rouge_l(res, ans)
        bleu = compute_bleu(res, ans)
        reward += rouge_l * 0.6 + bleu * 0.4
        rewards.append(reward)
    print(f"len of rewards: {len(rewards)} and avg reward {sum(rewards) / len(rewards)}")
    return rewards
```
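`compute_rouge_l` and `compute_bleu` are helpers not shown above; a minimal sketch of how they could be implemented with the `rouge_score` and `nltk` packages (my own choice of libraries, not part of the original code):

```python
# Hypothetical helpers for the snippet above.
from rouge_score import rouge_scorer                                    # pip install rouge-score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction  # pip install nltk

_rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def compute_rouge_l(hyp: str, ref: str) -> float:
    # ROUGE-L F1 in [0, 1]; RougeScorer.score takes (target, prediction)
    return _rouge.score(ref, hyp)['rougeL'].fmeasure

def compute_bleu(hyp: str, ref: str) -> float:
    # Sentence-level BLEU with smoothing so short outputs don't collapse to 0
    smooth = SmoothingFunction().method1
    return sentence_bleu([ref.split()], hyp.split(), smoothing_function=smooth)
```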
```python
import numpy as np

def length_reward(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    rewards = []
    avg_answer_len = np.mean([len(ans.split()) for ans in answer])
    sigma = avg_answer_len * 0.25
    for res, ans in zip(responses, answer):
        len_res = len(res.split())
        len_ans = len(ans.split())
        len_diff = abs(len_res - len_ans)
        # Gaussian reward function: e^(-(x^2) / (2 * sigma^2))
        # This gives a reward of 1.0 for a perfect match, smoothly decreasing to 0.
        reward = np.exp(-0.5 * (len_diff / sigma) ** 2)
        rewards.append(reward)
    return rewards
```
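For intuition, a quick numeric check of how the Gaussian decays with the length difference (sigma = 10 is just an illustrative value):

```python
import numpy as np

sigma = 10.0                                    # e.g. avg answer length of 40 words * 0.25
for len_diff in [0, 5, 10, 20, 30]:
    r = np.exp(-0.5 * (len_diff / sigma) ** 2)
    print(len_diff, round(float(r), 3))         # 0→1.0, 5→0.882, 10→0.607, 20→0.135, 30→0.011
```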
```python
import re

def drug_adverse_preservation_reward(prompts, completions, answer, drug_names,
                                     physical_adverse_effects, non_pdv, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    rewards = []
    for res, pmpt, drugs_str, physical_effects_str, non_pdv_str in zip(
        responses, prompts, drug_names, physical_adverse_effects, non_pdv
    ):
        res_lower = res.lower()
        drug_set = {d.strip().lower() for d in drugs_str.split(',') if d.strip()}
        all_drugs_found = all(drug in res_lower for drug in drug_set)
        drug_reward = 1.0 if all_drugs_found else 0.0
        is_physical_prompt = 'physical_adverse_effects:' in pmpt
        adverse_effects_str = physical_effects_str if is_physical_prompt else non_pdv_str
        adverse_set = {a.strip().lower() for a in adverse_effects_str.split(',') if a.strip()}
        all_adverse_effects_found = all(effect in res_lower for effect in adverse_set)
        adverse_reward = 1.0 if all_adverse_effects_found else 0.0
        total_reward = drug_reward + adverse_reward
        print(total_reward)
        rewards.append(total_reward)
    return rewards
```
These are the reward functions I created for GRPO training. Besides these, can you come up with new reward functions?
Below are four complementary reward functions you can layer on top of (or mix and match with) the ones you already wrote. Each addresses a specific blind spot of n-gram or length-based metrics and is easy to implement with publicly available models / libraries.
Semantic similarity reward (BERTScore). When to use: open-ended generation, paraphrase tasks, abstractive summarisation, QA with wording variation.
```python
from bert_score import score   # pip install bert-score
import torch

def semantic_similarity_reward(prompts, completions, answer,
                               model_type="microsoft/deberta-large-mnli", **kwargs):
    responses = [c[0]['content'] for c in completions]
    # bert-score works batched
    P, R, F1 = score(responses, answer, model_type=model_type, lang="en",
                     verbose=False, idf=False, num_layers=17,
                     device="cuda" if torch.cuda.is_available() else "cpu")
    # return F1 as reward in [0, 1]
    return F1.cpu().tolist()
```
Why it helps: BERTScore uses contextual embeddings so it credits paraphrases that BLEU/ROUGE miss and correlates better with human judgements for modern LLM outputs.
Factual consistency reward (FactCC / NLI). When to use: summarisation, data-to-text, or any task where hallucination is a concern.
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

tok = AutoTokenizer.from_pretrained("manueldeprada/FactCC")
clf = AutoModelForSequenceClassification.from_pretrained("manueldeprada/FactCC").eval()

def factual_consistency_reward(source_texts, completions, **kwargs):
    """source_texts: list[str] same length as completions, each is the ground-truth context"""
    responses = [c[0]['content'] for c in completions]
    rewards = []
    for src, hyp in zip(source_texts, responses):
        inputs = tok(src, hyp, truncation=True, padding=True, return_tensors="pt")
        with torch.no_grad():
            logit = clf(**inputs).logits.squeeze()      # [contradict, consistent]
        prob_consistent = F.softmax(logit, dim=-1)[1].item()
        rewards.append(prob_consistent)                 # in [0, 1]
    return rewards
```
Why it helps: FactCC-style entailment classifiers explicitly punish contradictions and hallucinations that lexical metrics can’t detect.
Toxicity penalty reward (Detoxify). When to use: conversational agents, open-domain generation where safety matters.
Return 1 – toxicity probability so higher is better.
```python
from detoxify import Detoxify   # pip install detoxify==0.5.2

tox_model = Detoxify('original')

def toxicity_penalty_reward(prompts, completions, **kwargs):
    responses = [c[0]['content'] for c in completions]
    preds = tox_model.predict(responses)     # dict of arrays
    toxic_scores = preds['toxicity']         # probability in [0, 1]
    return [1.0 - t for t in toxic_scores]   # reward: 0 (very toxic) → 1 (clean)
```
Why it helps: Reinforcing low-toxicity outputs reduces unsafe generations without hand-written filters.
Diversity reward (unique n-grams). When to use: creative writing, dialogue systems, batch decoding where mode-collapse or bland repetition hurts UX.
```python
def diversity_reward(prompts, completions, n=4, **kwargs):
    responses = [c[0]['content'] for c in completions]
    rewards = []
    for res in responses:
        tokens = res.split()
        total_ngrams = len(tokens) - n + 1
        if total_ngrams <= 0:
            rewards.append(0.0)
            continue
        ngrams = [' '.join(tokens[i:i + n]) for i in range(total_ngrams)]
        unique = len(set(ngrams))
        rewards.append(unique / total_ngrams)   # proportion of unique n-grams, in (0, 1]
    return rewards
```
Why it helps: It directly combats the short loops or copy-heavy degenerate outputs that PPO/GRPO sometimes falls into by rewarding lexical variety (you could swap in self-BLEU, distinct-1/2/3, or entropy; a distinct-n sketch follows below).
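If you prefer the distinct-n flavour, a minimal sketch that averages distinct-1/2/3 per response (one possible definition among several, assumed here):

```python
def distinct_n_reward(prompts, completions, ns=(1, 2, 3), **kwargs) -> list[float]:
    """Average distinct-n (ratio of unique n-grams) over several n, per response."""
    responses = [c[0]['content'] for c in completions]
    rewards = []
    for res in responses:
        tokens = res.split()
        ratios = []
        for n in ns:
            total = len(tokens) - n + 1
            if total <= 0:
                continue
            ngrams = {tuple(tokens[i:i + n]) for i in range(total)}
            ratios.append(len(ngrams) / total)
        rewards.append(sum(ratios) / len(ratios) if ratios else 0.0)
    return rewards
```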
You can compose rewards linearly or dynamically:
```python
total_reward = (
    0.4 * semantic_similarity
    + 0.3 * factual_consistency
    + 0.2 * length_reward        # your existing Gaussian length match
    + 0.1 * diversity_reward
) * toxicity_penalty             # multiplicative safety umbrella
```
Weights can be tuned via grid search or Bayesian optimisation against a small set of human preference data.
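As a sketch of that tuning loop (the `reward_matrix` / `human_scores` inputs and the 0.1 grid step are assumptions of mine), pick the mix that best rank-correlates with a small preference set:

```python
import itertools
from scipy.stats import spearmanr   # pip install scipy

def tune_weights(reward_matrix, human_scores, step=0.1):
    """Grid-search mixing weights that best rank-correlate with human preference scores.

    reward_matrix: dict {reward_name: [per-sample rewards]}, each list aligned with human_scores.
    """
    names = list(reward_matrix)
    candidates = [round(i * step, 2) for i in range(int(1 / step) + 1)]
    best_weights, best_corr = None, float('-inf')
    for weights in itertools.product(candidates, repeat=len(names)):
        if abs(sum(weights) - 1.0) > 1e-6:      # keep only convex combinations
            continue
        mixed = [sum(w * reward_matrix[n][i] for w, n in zip(weights, names))
                 for i in range(len(human_scores))]
        corr, _ = spearmanr(mixed, human_scores)
        if corr > best_corr:
            best_weights, best_corr = dict(zip(names, weights)), corr
    return best_weights, best_corr
```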
Keep each component reward in [0, 1] before mixing, and clip outliers.

Feel free to mix and match or specialise further (e.g. a readability reward via Flesch-Kincaid, a style-match reward with a domain classifier, or a keyword-coverage reward for slot-filling; a readability sketch follows below). The four above give you a solid, complementary reward suite that targets semantics, factuality, safety and diversity, which are common pain-points in modern LLM fine-tuning workflows.
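As one example of such a specialised reward, a readability sketch built on the `textstat` package (the target grade band and sigma are arbitrary assumptions):

```python
import numpy as np
import textstat   # pip install textstat

def readability_reward(prompts, completions, target_grade=8.0, sigma=2.0, **kwargs) -> list[float]:
    """Reward completions whose Flesch-Kincaid grade level is close to a target grade."""
    responses = [c[0]['content'] for c in completions]
    rewards = []
    for res in responses:
        grade = textstat.flesch_kincaid_grade(res)   # U.S. school grade level
        # Same Gaussian shape as the length reward: 1.0 at the target, decaying with distance
        rewards.append(float(np.exp(-0.5 * ((grade - target_grade) / sigma) ** 2)))
    return rewards
```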
I would like to make a hallucination_penalty_reward function; please make it robust and elegant. You can explore semantic match or word match. You have that liberty, do your best please.
Below is a self-contained, “plug-and-play” reward function that penalises hallucinations by blending two complementary signals — lexical precision (How many content words in the response actually appear in the reference?) and semantic entailment (Does each sentence in the response follow from the reference according to an NLI model?).
Both subscores are in [0, 1] (1 = no hallucination), then linearly combined. Feel free to change the weights alpha and beta if you want to emphasise one view over the other.
```python
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import spacy

# ---------- heavy-weight models are loaded only once ----------
_nli_tok = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
_nli_model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli"
).eval()
_spacy_nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
_spacy_nlp.add_pipe("sentencizer")   # cheap sentence boundaries since the parser is disabled

# label index for ENTAILMENT in BART-MNLI (normalise label-name case to be safe)
_ENTAIL_IDX = {k.lower(): v for k, v in _nli_model.config.label2id.items()}["entailment"]

# ---------- helper: lexical precision on content words ----------
def _lexical_precision(resp: str, ref: str) -> float:
    doc_r = _spacy_nlp(resp.lower())
    doc_ref = _spacy_nlp(ref.lower())
    # keep only content words (nouns, verbs, adjectives, proper nouns, numbers)
    keep = {"NOUN", "VERB", "ADJ", "PROPN", "NUM"}
    words_r = {t.lemma_ for t in doc_r if t.pos_ in keep}
    words_ref = {t.lemma_ for t in doc_ref if t.pos_ in keep}
    if not words_r:
        return 0.0
    correct = words_r & words_ref
    precision = len(correct) / len(words_r)   # fraction of content words supported
    return precision

# ---------- helper: mean entailment probability for sentences ----------
def _mean_entailment(resp: str, ref: str, device=None) -> float:
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    _nli_model.to(device)
    # split response into sentences via spaCy
    sent_docs = list(_spacy_nlp(resp).sents)
    if not sent_docs:
        return 0.0
    inputs = _nli_tok(
        [ref] * len(sent_docs),          # premise
        [s.text for s in sent_docs],     # hypothesis
        padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = _nli_model(**inputs).logits                       # (n_sents, 3 classes)
    # softmax over the class dimension, then take the entailment column
    probs = F.softmax(logits, dim=-1)[:, _ENTAIL_IDX].cpu().numpy()
    return float(np.mean(probs))                                   # average entailment prob.

# ---------- main reward ----------
def hallucination_penalty_reward(
    prompts, completions, answers,
    alpha: float = 0.5,   # weight on lexical precision
    beta: float = 0.5,    # weight on entailment
    clip: bool = True,    # clip to [0, 1]
    **kwargs
) -> list[float]:
    """
    Returns a *reward* in [0, 1] for each (completion, answer) pair:
    1.0 == no hallucination, 0.0 == severe hallucination.
    """
    assert abs(alpha + beta - 1.0) < 1e-6, "alpha + beta must equal 1"
    responses = [c[0]['content'] for c in completions]
    rewards = []
    for res, ref in zip(responses, answers):
        lex_prec = _lexical_precision(res, ref)
        entail = _mean_entailment(res, ref)
        reward = alpha * lex_prec + beta * entail
        if clip:
            reward = max(0.0, min(1.0, reward))
        rewards.append(reward)
    return rewards
```
| Component | Captures | Failure mode it protects against |
|---|---|---|
| Lexical precision (content-word overlap) | Extra facts, numbers, names that are literally not in the reference | "Invented" entities, dates, dosages, etc. |
| Entailment probability (NLI) | Paraphrased or re-ordered statements that must still be supported by the reference | Deep-paraphrase hallucinations that slip past n-gram checks |
By averaging the per-sentence entailment you avoid one rogue statement tanking the whole score while still discouraging unsupported claims.
The entailment subscore is differentiable w.r.t. the NLI logits (you could back-prop through the NLI if needed), and both subscores are quick enough for PPO/GRPO loops.
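In equation form, the combined score implemented above, for a response $r$ with sentences $s_1,\dots,s_k$ scored against reference $a$, where $C(\cdot)$ is the set of content-word lemmas:

```latex
\mathrm{score}(r, a) \;=\;
\alpha \cdot \underbrace{\frac{|C(r) \cap C(a)|}{|C(r)|}}_{\text{lexical precision}}
\;+\;
\beta \cdot \underbrace{\frac{1}{k}\sum_{i=1}^{k} P_{\mathrm{NLI}}(\text{entail} \mid a,\, s_i)}_{\text{mean entailment}},
\qquad \alpha + \beta = 1
```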
Tune alpha and beta according to which failure mode matters more for your task. If you are grounding against retrieved or source context rather than a gold answer, pass that context as the reference to _mean_entailment so the model is judged against its evidence, not a gold summary.

Feel free to refactor or extend (e.g. include knowledge-base verification for numbers, or a stricter contradiction-penalty term); a small number-check sketch follows below. This scaffold should nonetheless give you a robust, elegant starting point for discouraging hallucinated content during RL fine-tuning.
Can you put the range in [-1, 1], so that hallucinations are penalized?
Here’s a tidy drop-in that maps the score to [-1, 1], where +1 means “fully grounded / zero hallucination” and -1 means “severely hallucinated”.
All the internals stay the same; only the final rescaling changes.
```python
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import spacy

# ---------- one-off model / pipeline setup ----------
_nli_tok = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
_nli_model = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli"
).eval()
_spacy_nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
_spacy_nlp.add_pipe("sentencizer")   # sentence boundaries without the parser
_ENTAIL_IDX = {k.lower(): v for k, v in _nli_model.config.label2id.items()}["entailment"]

# ---------- helpers ----------
def _lexical_precision(resp: str, ref: str) -> float:
    """Content-word precision in [0, 1]."""
    keep = {"NOUN", "VERB", "ADJ", "PROPN", "NUM"}
    r = {t.lemma_ for t in _spacy_nlp(resp.lower()) if t.pos_ in keep}
    rf = {t.lemma_ for t in _spacy_nlp(ref.lower()) if t.pos_ in keep}
    if not r:
        return 0.0
    return len(r & rf) / len(r)

def _mean_entailment(resp: str, ref: str, device=None) -> float:
    """Average entailment probability in [0, 1] for each sentence in `resp`."""
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    _nli_model.to(device)
    sents = list(_spacy_nlp(resp).sents)
    if not sents:
        return 0.0
    inputs = _nli_tok([ref] * len(sents), [s.text for s in sents],
                      padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = _nli_model(**inputs).logits                              # (n_sents, 3 classes)
    probs = F.softmax(logits, dim=-1)[:, _ENTAIL_IDX].cpu().numpy()       # entailment probs
    return float(np.mean(probs))

# ---------- main reward ----------
def hallucination_penalty_reward(
    prompts, completions, answers,
    alpha: float = 0.5,   # weight for lexical precision
    beta: float = 0.5,    # weight for entailment
    clip: bool = True,
    **kwargs
) -> list[float]:
    """
    Reward in [-1, 1].
      +1 → perfect grounding
       0 → borderline (half supported)
      -1 → heavy hallucination
    """
    assert abs(alpha + beta - 1.0) < 1e-6, "alpha + beta must equal 1"
    responses = [c[0]['content'] for c in completions]
    rewards = []
    for res, ref in zip(responses, answers):
        # 1. Compute base score in [0, 1]
        lex_prec = _lexical_precision(res, ref)
        entail = _mean_entailment(res, ref)
        score_01 = alpha * lex_prec + beta * entail
        # 2. Rescale to [-1, 1]
        reward = 2.0 * score_01 - 1.0
        if clip:
            reward = max(-1.0, min(1.0, reward))
        rewards.append(reward)
    return rewards
```
```text
score_01 = 0.0  →  reward = -1   (strong hallucination)
score_01 = 0.5  →  reward =  0   (on-the-fence)
score_01 = 1.0  →  reward = +1   (fully factual)
```
You can still adjust alpha / beta, add contradiction penalties, or multiply by a toxicity mask exactly as before; the final step just linearly shifts the interval to meet your requirement.
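If you do want an explicit contradiction penalty, one possible variant (reusing _nli_tok, _nli_model and _spacy_nlp from the block above; the idea of subtracting a gamma-weighted term is an assumption of mine) computes the mean contradiction probability the same way:

```python
import torch
import torch.nn.functional as F

# Contradiction index from the same BART-MNLI config loaded above
_CONTRA_IDX = {k.lower(): v for k, v in _nli_model.config.label2id.items()}["contradiction"]

def _mean_contradiction(resp: str, ref: str, device=None) -> float:
    """Average contradiction probability in [0, 1] over sentences of `resp`."""
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    _nli_model.to(device)
    sents = list(_spacy_nlp(resp).sents)
    if not sents:
        return 0.0
    inputs = _nli_tok([ref] * len(sents), [s.text for s in sents],
                      padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = _nli_model(**inputs).logits
    return float(F.softmax(logits, dim=-1)[:, _CONTRA_IDX].mean().item())

# Example: score_01' = max(0.0, score_01 - gamma * _mean_contradiction(res, ref))
# applied before the 2 * x - 1 rescaling in hallucination_penalty_reward.
```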
```python
respo = ['Taxol may lead to non-physical adverse effects such as a feeling of frustration or hopelessness, often described as a sense of cant catch a break.']
answe = ['Taxol, Xeloda, paracetomol, has been associated with adverse effects including pain in the left hand/thumb, a port protruding through the skin, and the necessity of a brace on the hand.']

# debug output while testing the reward:
# {'adverse', 'effect'}
# {'feeling', 'frustration', 'non', 'effect', 'hopelessness', 'catch', 'break', 'adverse', 'describe', 'physical', 'sense', 'lead', '-', 'such'}
# [(0.1428571428571428,)]
```
Your code is unable to capture the drug names present in the text.
In the earlier helper I lower-cased the entire string before sending it through spaCy:
```python
doc_r = _spacy_nlp(resp.lower())
```
Lower-casing destroys the capitalisation cues that spaCy’s tagger relies on to label “Taxol”, “Xeloda”, “Paracetomol”, … as PROPN. Consequently they never make it into the {t.pos_ in keep} filter, so your matched set is just {'adverse', 'effect'}.
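A quick way to see this effect (illustrative only; the exact tags depend on the spaCy model and version):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
for text in ["Taxol causes adverse effects.", "taxol causes adverse effects."]:
    # Compare POS tags with and without the original capitalisation
    print([(t.text, t.pos_) for t in nlp(text)])
# Capitalised "Taxol" is typically tagged PROPN; lower-cased "taxol" may receive a
# different tag and, as in the trace above, fall out of the content-word filter.
```

The fixed version below keeps the original casing and adds a small drug lexicon as a fallback.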
```python
import re
import torch
import torch.nn.functional as F
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --------------------------------------------------------------------
# One-off model setup
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
nlp.add_pipe("sentencizer")          # sentence boundaries without the parser
tok = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
nli = AutoModelForSequenceClassification.from_pretrained(
    "facebook/bart-large-mnli").eval()
ENTAIL = {k.lower(): v for k, v in nli.config.label2id.items()}["entailment"]

# Optional tiny drug lexicon (add your own or plug RxNorm)
DRUG_VOCAB = {
    "taxol", "xeloda", "paracetomol", "paracetamol",
    "paclitaxel", "acetaminophen"
}
drug_pat = re.compile(r"\b(" + "|".join(map(re.escape, DRUG_VOCAB)) + r")\b", re.I)

# --------------------------------------------------------------------
def _bag_of_content(text: str) -> set[str]:
    """Content lemmas + drug names, all lower-cased."""
    doc = nlp(text)                                   # **keep original case here**
    keep_pos = {"NOUN", "VERB", "ADJ", "PROPN", "NUM"}
    bag = {t.lemma_.lower() for t in doc if t.pos_ in keep_pos}
    # very light drug matcher
    bag.update(m.group(0).lower() for m in drug_pat.finditer(text))
    return bag

def _lexical_precision(resp: str, ref: str) -> float:
    words_r, words_ref = _bag_of_content(resp), _bag_of_content(ref)
    return 0.0 if not words_r else len(words_r & words_ref) / len(words_r)

def _mean_entailment(resp: str, ref: str, device=None) -> float:
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    nli.to(device)
    sents = list(nlp(resp).sents)
    if not sents:
        return 0.0
    inputs = tok([ref] * len(sents), [s.text for s in sents],
                 truncation=True, padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = nli(**inputs).logits                 # (n_sents, 3 classes)
    # softmax over classes, then average the entailment column
    return float(torch.softmax(logits, dim=-1)[:, ENTAIL].mean().item())

def hallucination_penalty_reward(
    prompts, completions, answers,
    alpha=0.5, beta=0.5, clip=True, **kw
) -> list[float]:
    """
    Reward in [-1, +1]  ( +1 = fully grounded, -1 = heavy hallucination )
    """
    resps = [c[0]["content"] for c in completions]
    scores = []
    for r, a in zip(resps, answers):
        s01 = alpha * _lexical_precision(r, a) + beta * _mean_entailment(r, a)
        reward = 2 * s01 - 1                          # map [0, 1] → [-1, 1]
        scores.append(max(-1, min(1, reward)) if clip else reward)
    return scores
```
```python
resp = "Taxol may lead to non-physical adverse effects such as frustration."
ans  = "Taxol, Xeloda, Paracetomol have adverse effects including hand pain."
print(_lexical_precision(resp, ans))   # ➞ 0.50 (Taxol + adverse + effect matched)
```
A few notes and possible extensions:

- _bag_of_content now keeps “Taxol” (capitalisation intact) and also finds any drug names listed in DRUG_VOCAB even if spaCy mis-tags them.
- For biomedical text, swap en_core_web_sm for a SciSpaCy model (en_core_sci_lg, en_ner_bc5cdr_md, …) and replace the lexicon regex with an entity check such as ent.label_ in {"CHEMICAL", "DRUG"}.
- Use fuzzy matching (from rapidfuzz import fuzz) to catch minor typos (“Paracetomol” vs “Paracetamol”); see the sketch after this list.
- Build DRUG_VOCAB from your training corpus once, then cache it.

With this small edit your reward now correctly credits / penalises outputs for mentioning (or hallucinating) the relevant drug names.