Skip to content

Commit 78004c3

Browse files
committed
update
1 parent f39a4b7 commit 78004c3

3 files changed

Lines changed: 164 additions & 1 deletion

File tree

opencompass/configs/datasets/molculariq/molculariq_gen.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@
152152
abbr=f'MolecularIQ-{_name}',
153153
type=MoleculariqDataset,
154154
name=_name,
155-
path='MolecularIQ/test_task',
155+
path='opencompass/MolecularIQ',
156156
reader_cfg=moleculariq_reader_cfg,
157157
infer_cfg=moleculariq_infer_cfg,
158158
eval_cfg=moleculariq_eval_cfg,
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
from opencompass.openicl.icl_raw_prompt_template import RawPromptTemplate
2+
from opencompass.openicl.icl_retriever import ZeroRetriever
3+
from opencompass.openicl.icl_inferencer import GenInferencer
4+
from opencompass.datasets.moleculariq import MoleculariqDataset
5+
from opencompass.datasets.moleculariq import (
6+
MoleculariqCountEvaluator,
7+
MoleculariqIndexEvaluator,
8+
MoleculariqGenerationEvaluator,
9+
)
10+
11+
# Reader configuration shared by every MolecularIQ dataset entry:
# 'prompt' is the only input column and 'ground_truth' is the output column.
moleculariq_reader_cfg = {
    'input_columns': ['prompt'],
    'output_column': 'ground_truth',
}
15+
16+
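# System prompt used for Table 1 of the paper (taken from the paper's Appendix B.2.2).
# NOTE(review): unlike system_prompt_concise below, the examples in this prompt have
# no JSON braces, use spaces instead of underscores in the keys, and "within<answer>"
# has no space -- confirm this is the intended prompt text and not a copy/paste artifact.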
17+
system_prompt = """You are an expert chemist. Answer molecular property, understanding, structural analysis and molecular generation questions precisely and accurately.
18+
19+
CRITICAL: Only content within<answer></answer> tags will be extracted. ALWAYS return JSON format.
20+
21+
KEY REQUIREMENT: Use EXACT key names from the question. Never modify or invent keys.
22+
23+
INDEXING: Atoms are indexed from 0 to the end of the SMILES string from left to right. Only heavy atoms (skip [H], include [2H]/[3H]).
24+
Examples:
25+
- "CCO": C(0), C(1), O(2)
26+
- "CC(C)O": C(0), C(1), C(2), O(3)
27+
- "CC(=O)N": C(0), C(1), O(2), N(3)
28+
29+
ABSENT FEATURES: Use 0 for counts, [] for indices. Never null or omit.
30+
31+
ALWAYS USE JSON with EXACT keys from the question:
32+
33+
Single count (key from question: "alcohol count"):<answer>"alcohol count": 2</answer>
34+
<answer>"alcohol count": 0</answer> (if absent)
35+
36+
Single index (key from question: "ketone indices"):<answer>"ketone indices": [5]</answer>
37+
<answer>"ketone indices": []</answer> (if absent)
38+
39+
Multiple properties (keys from question: "ring count", "halogen indices"):<answer>"ring count": 2, "halogen indices": [3, 7]</answer>
40+
<answer>"ring count": 0, "halogen indices": []</answer> (if all absent)
41+
42+
Constraint generation:<answer>"smiles": "CC(O)C"</answer>
43+
44+
Include ALL requested properties. Never null or omit."""
45+
46+
# questions.py: with_key_hints
47+
system_prompt_with_key_hints = """You are an expert chemist specializing in molecular understanding, property calculations, structural analysis and molecular generation.
48+
49+
CRITICAL: Only content within <answer></answer> tags will be extracted as your response. Everything outside these tags is ignored.
50+
51+
KEY REQUIREMENT: Always use the EXACT key names provided in the question. Do not modify or create your own keys.
52+
53+
IMPORTANT: If a requested feature is not present in the molecule, you MUST return 0 for counts or [] for indices. Never null or omit.
54+
55+
INDEXING RULES:
56+
- Atom indices are 0-based
57+
- Atoms are numbered from 0 in the order they appear in the SMILES string from left to right
58+
- Regular hydrogens (implicit or explicit [H]) are NOT indexed
59+
- Isotopes ([2H], [3H]) ARE indexed as they appear
60+
- Examples:
61+
- "CCO": C(0), C(1), O(2)
62+
- "CC(C)O": C(0), C(1), C(2), O(3)
63+
- "CC(=O)N": C(0), C(1), O(2), N(3)
64+
65+
For SINGLE COUNT tasks:
66+
- Return a JSON object with the EXACT key from the question
67+
- Return 0 if the feature is absent
68+
- Examples: <answer>{"alcohol_group_count": 2}</answer>
69+
- For absent features: <answer>{"alcohol_group_count": 0}</answer>
70+
71+
For SINGLE INDEX tasks:
72+
- Return a JSON object with the EXACT key from the question
73+
- Return empty list [] if the feature is absent
74+
- Examples: <answer>{"alcohol_group_indices": [3, 7]}</answer>
75+
- For absent features: <answer>{"alcohol_group_indices": []}</answer>
76+
77+
For MULTIPLE COUNT tasks with key hints:
78+
- Return a JSON object using the EXACT keys provided
79+
- Each key maps to an integer count (0 if absent)
80+
- Example: <answer>{"alcohol_group_count": 2, "ketone_group_count": 0}</answer>
81+
82+
For MULTIPLE INDEX tasks with key hints:
83+
- Return a JSON object using the EXACT keys provided
84+
- Each key maps to a list of indices (empty list [] if absent)
85+
- Example: <answer>{"alcohol_group_indices": [3, 7], "ketone_group_indices": []}</answer>
86+
87+
For CONSTRAINT GENERATION tasks:
88+
- Return a JSON object with "smiles" as the key
89+
- Example: <answer>{"smiles": "CC(=O)CC(O)C"}</answer>"""
90+
91+
# questions.py: concise
92+
system_prompt_concise = """You are an expert chemist. Answer molecular property, understanding, structural analysis and molecular generation questions precisely and accurately.
93+
94+
CRITICAL: Only content within <answer></answer> tags will be extracted. ALWAYS return JSON format.
95+
96+
KEY REQUIREMENT: Use EXACT key names from the question. Never modify or invent keys.
97+
98+
INDEXING: Atoms are indexed from 0 to the end of the SMILES string from left to right. Only heavy atoms (skip [H], include [2H]/[3H]).
99+
Examples:
100+
- "CCO": C(0), C(1), O(2)
101+
- "CC(C)O": C(0), C(1), C(2), O(3)
102+
- "CC(=O)N": C(0), C(1), O(2), N(3)
103+
104+
ABSENT FEATURES: Use 0 for counts, [] for indices. Never null or omit.
105+
106+
ALWAYS USE JSON with EXACT keys from the question:
107+
108+
Single count (key from question: "alcohol_count"):
109+
<answer>{"alcohol_count": 2}</answer>
110+
<answer>{"alcohol_count": 0}</answer> (if absent)
111+
112+
Single index (key from question: "ketone_indices"):
113+
<answer>{"ketone_indices": [5]}</answer>
114+
<answer>{"ketone_indices": []}</answer> (if absent)
115+
116+
Multiple properties (keys from question: "ring_count", "halogen_indices"):
117+
<answer>{"ring_count": 2, "halogen_indices": [3, 7]}</answer>
118+
<answer>{"ring_count": 0, "halogen_indices": []}</answer> (if all absent)
119+
120+
Constraint generation:
121+
<answer>{"smiles": "CC(O)C"}</answer>
122+
123+
Include ALL requested properties. Never null or omit."""
124+
125+
# Evaluator class to use for each MolecularIQ task name.
_evaluator_map = dict(
    count=MoleculariqCountEvaluator,
    index=MoleculariqIndexEvaluator,
    generation=MoleculariqGenerationEvaluator,
)

# Build one dataset config per task name. Every entry shares the reader config
# and the dataset path; only the abbreviation, the task name and the evaluator
# type differ between entries.
moleculariq_datasets = []
for _name in ('count', 'index', 'generation'):
    # Zero-shot generation: a system message carrying `system_prompt`, followed
    # by a user message whose content is the '{prompt}' placeholder
    # (presumably filled from the 'prompt' input column -- verify against
    # RawPromptTemplate).
    moleculariq_infer_cfg = {
        'prompt_template': {
            'type': RawPromptTemplate,
            'messages': [
                dict(role='system', content=system_prompt),
                dict(role='user', content='{prompt}'),
            ],
        },
        'retriever': {'type': ZeroRetriever},
        'inferencer': {'type': GenInferencer},
    }
    # The evaluator is looked up by the current task name.
    moleculariq_eval_cfg = {
        'evaluator': {'type': _evaluator_map[_name]},
    }

    moleculariq_datasets.append({
        'abbr': f'MolecularIQ-{_name}',
        'type': MoleculariqDataset,
        'name': _name,
        'path': 'opencompass/MolecularIQ',
        'reader_cfg': moleculariq_reader_cfg,
        'infer_cfg': moleculariq_infer_cfg,
        'eval_cfg': moleculariq_eval_cfg,
    })

opencompass/utils/datasets_info.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
DATASETS_MAPPING = {
2+
"opencompass/MolecularIQ":{
3+
"ms_id": None,
4+
"hf_id": None,
5+
"local": "./data/MolecularIQ/test_task",
6+
},
27
"opencompass/IMO-Answer-Bench": {
38
"ms_id": None,
49
"hf_id": None,

0 commit comments

Comments
 (0)