Skip to content

Commit e2ce8bf

Browse files
committed
feat: Add POS information to tag generation API across C, C++, and JS layers
Extend the tag generation API so each tag carries its part-of-speech (POS) alongside the tag text, enabling callers to filter or display tags by POS without re-analyzing the text.

C++ API changes:
- Introduce TagEntry struct in tag_generator.h with tag (std::string) and pos (core::PartOfSpeech) fields
- TagGenerator::generate() and generateFromText() now return std::vector<TagEntry> instead of std::vector<std::string>
- Suzume::generateTags() overloads updated accordingly in suzume.h/cpp

C API changes (suzume_c.h/cpp):
- Add a const char** pos field to the suzume_tags_t struct, placed between the existing char** tags and size_t count fields (this changes the struct layout)
- suzume_generate_tags() and suzume_generate_tags_with_options() populate the pos array using posToString() on each TagEntry
- suzume_tags_free() frees each POS string and the pos array itself to avoid memory leaks

JS/WASM API changes (js/index.ts):
- Introduce Tag interface with tag: string and pos: string fields
- generateTags() return type changed from string[] to Tag[]
- parseTags() reads the new pos pointer array from the suzume_tags_t layout (field order: tags ptr, pos ptr, count)
- Fix memory access: stop relying on HEAPU8 (which may not be exported by the Emscripten build). loadBinaryDictionary now copies data through a Uint8Array view derived from HEAPU32's buffer, and the option struct is initialized through HEAPU32 directly

CLI output:
- suzume-cli and cmd_analyze now output tag + tab + POS on each line
- cmd_test.cpp adapted to extract tag.tag when building the comparison set

WASM test suite refactored:
- Remove monolithic suzume.test.ts
- Add helpers.ts: shared WASM module loader, allocString, parseMorphemes, parseTags, getTagCount utilities
- Add c-api-analyze.test.ts: C API analyze tests covering POS fields, conj fields, mixed POS sentences, and create_with_options
- Add c-api-tags.test.ts: C API tag generation tests covering POS filter, max_tags, min_length, and excludeBasic options
- Add js-api.test.ts: JS API struct layout compatibility tests

C++ unit tests (tag_generator_test.cpp):
- Update all tag comparisons from tag == "str" to tag.tag == "str"
- Add POS assertions where appropriate (Verb, Adjective, Adverb, Particle, Auxiliary, Noun, Pronoun)
1 parent fd8c064 commit e2ce8bf

16 files changed

Lines changed: 800 additions & 471 deletions

js/index.ts

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,16 @@ export interface Morpheme {
5757
conjForm: string | null;
5858
}
5959

60+
/**
61+
* Tag entry with POS information
62+
*/
63+
export interface Tag {
64+
/** Tag text (surface or lemma) */
65+
tag: string;
66+
/** Part of speech (English) */
67+
pos: string;
68+
}
69+
6070
/**
6171
* Options for tag generation
6272
*/
@@ -246,9 +256,9 @@ export class Suzume {
246256
*
247257
* @param text - UTF-8 encoded Japanese text
248258
* @param options - Optional tag generation options
249-
* @returns Array of tag strings
259+
* @returns Array of tag entries with POS information
250260
*/
251-
generateTags(text: string, options?: TagOptions): string[] {
261+
generateTags(text: string, options?: TagOptions): Tag[] {
252262
const textBytes = this.module.lengthBytesUTF8(text) + 1;
253263
const textPtr = this.module._malloc(textBytes);
254264

@@ -275,13 +285,10 @@ export class Suzume {
275285
const optionsPtr = this.module._malloc(OPTIONS_SIZE);
276286

277287
try {
278-
const heap = (this.module as unknown as { HEAPU8: Uint8Array }).HEAPU8;
279288
const heapU32 = (this.module as unknown as { HEAPU32: Uint32Array }).HEAPU32;
280289

281-
// Zero out the struct first
282-
heap.fill(0, optionsPtr, optionsPtr + OPTIONS_SIZE);
283-
284-
heap[optionsPtr] = posFilter;
290+
// pos_filter is uint8_t at offset 0, padded to 4 bytes — write as uint32
291+
heapU32[optionsPtr >> 2] = posFilter;
285292
heapU32[(optionsPtr + 4) >> 2] = options.excludeBasic ? 1 : 0;
286293
heapU32[(optionsPtr + 8) >> 2] = options.useLemma !== false ? 1 : 0;
287294
heapU32[(optionsPtr + 12) >> 2] = options.minLength ?? 2;
@@ -345,8 +352,10 @@ export class Suzume {
345352
loadBinaryDictionary(data: Uint8Array): boolean {
346353
const dataPtr = this.module._malloc(data.byteLength);
347354
try {
348-
const heap = (this.module as unknown as { HEAPU8: Uint8Array }).HEAPU8;
349-
heap.set(data, dataPtr);
355+
// Derive Uint8Array view from HEAPU32's underlying buffer (HEAPU8 may not be exported)
356+
const heapU32 = (this.module as unknown as { HEAPU32: Uint32Array }).HEAPU32;
357+
const heapU8 = new Uint8Array(heapU32.buffer);
358+
heapU8.set(data, dataPtr);
350359
return this._loadBinaryDict(this.handle, dataPtr, data.byteLength) === 1;
351360
} finally {
352361
this.module._free(dataPtr);
@@ -425,20 +434,26 @@ export class Suzume {
425434
}
426435

427436
// Parse suzume_tags_t structure from WASM memory
428-
private parseTags(tagsPtr: number): string[] {
429-
// suzume_tags_t layout:
430-
// - tags: pointer to char** (4 bytes on wasm32)
431-
// - count: size_t (4 bytes on wasm32)
437+
private parseTags(tagsPtr: number): Tag[] {
438+
// suzume_tags_t layout (wasm32):
439+
// - tags: pointer to char** (4 bytes)
440+
// - pos: pointer to const char** (4 bytes)
441+
// - count: size_t (4 bytes)
432442
const HEAPU32 = (this.module as unknown as { HEAPU32: Uint32Array }).HEAPU32;
433443

434444
const tagsArrayPtr = HEAPU32[tagsPtr >> 2];
435-
const count = HEAPU32[(tagsPtr >> 2) + 1];
445+
const posArrayPtr = HEAPU32[(tagsPtr >> 2) + 1];
446+
const count = HEAPU32[(tagsPtr >> 2) + 2];
436447

437-
const tags: string[] = [];
448+
const tags: Tag[] = [];
438449

439450
for (let idx = 0; idx < count; idx++) {
440451
const tagPtr = HEAPU32[(tagsArrayPtr >> 2) + idx];
441-
tags.push(this.module.UTF8ToString(tagPtr));
452+
const posPtr = HEAPU32[(posArrayPtr >> 2) + idx];
453+
tags.push({
454+
tag: this.module.UTF8ToString(tagPtr),
455+
pos: this.module.UTF8ToString(posPtr),
456+
});
442457
}
443458

444459
return tags;

src/cli/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ int main(int argc, char* argv[]) {
9797
// Output tags
9898
auto tags = analyzer.generateTags(text);
9999
for (const auto& tag : tags) {
100-
std::cout << tag << "\n";
100+
std::cout << tag.tag << "\t" << suzume::core::posToString(tag.pos) << "\n";
101101
}
102102
} else {
103103
// Output morpheme analysis

src/postprocess/tag_generator.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,12 @@ std::string TagGenerator::getTagString(const core::Morpheme& morpheme) const {
8888
return morpheme.surface;
8989
}
9090

91-
std::vector<std::string> TagGenerator::generate(
91+
std::vector<TagEntry> TagGenerator::generate(
9292
const std::vector<core::Morpheme>& morphemes) const {
9393
// Post-process morphemes
9494
auto processed = postprocessor_.process(morphemes);
9595

96-
std::vector<std::string> tags;
96+
std::vector<TagEntry> tags;
9797
std::unordered_set<std::string> seen;
9898

9999
for (const auto& morpheme : processed) {
@@ -116,7 +116,7 @@ std::vector<std::string> TagGenerator::generate(
116116
seen.insert(tag);
117117
}
118118

119-
tags.push_back(tag);
119+
tags.push_back({tag, morpheme.pos});
120120

121121
// Check max tags
122122
if (options_.max_tags > 0 && tags.size() >= options_.max_tags) {
@@ -127,7 +127,7 @@ std::vector<std::string> TagGenerator::generate(
127127
return tags;
128128
}
129129

130-
std::vector<std::string> TagGenerator::generateFromText(std::string_view /*text*/) {
130+
std::vector<TagEntry> TagGenerator::generateFromText(std::string_view /*text*/) {
131131
// This would require access to Analyzer
132132
// For now, return empty - caller should use Analyzer + generate()
133133
return {};

src/postprocess/tag_generator.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,14 @@ struct TagGeneratorOptions {
3636
bool exclude_basic = false; // Exclude basic words (hiragana-only lemma)
3737
};
3838

39+
/**
40+
* @brief Tag entry with POS information
41+
*/
42+
struct TagEntry {
43+
std::string tag;
44+
core::PartOfSpeech pos;
45+
};
46+
3947
/**
4048
* @brief Tag generator from morphemes
4149
*/
@@ -47,17 +55,17 @@ class TagGenerator {
4755
/**
4856
* @brief Generate tags from morphemes
4957
* @param morphemes Input morphemes
50-
* @return Vector of tag strings
58+
* @return Vector of tag entries with POS information
5159
*/
52-
std::vector<std::string> generate(
60+
std::vector<TagEntry> generate(
5361
const std::vector<core::Morpheme>& morphemes) const;
5462

5563
/**
5664
* @brief Generate tags from text using analyzer
5765
* @param text Input text
58-
* @return Vector of tag strings
66+
* @return Vector of tag entries with POS information
5967
*/
60-
static std::vector<std::string> generateFromText(std::string_view text);
68+
static std::vector<TagEntry> generateFromText(std::string_view text);
6169

6270
private:
6371
TagGeneratorOptions options_;

src/suzume-cli/cmd_analyze.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ void outputMorpheme(const std::vector<core::Morpheme>& morphemes) {
3939
}
4040
}
4141

42-
void outputTags(const std::vector<std::string>& tags) {
42+
void outputTags(const std::vector<postprocess::TagEntry>& tags) {
4343
for (const auto& tag : tags) {
44-
std::cout << tag << "\n";
44+
std::cout << tag.tag << "\t" << core::posToString(tag.pos) << "\n";
4545
}
4646
}
4747

src/suzume-cli/cmd_test.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,10 @@ std::vector<std::string> split(const std::string& str, char delim) {
3434
bool runSingleTest(Suzume& analyzer, const std::string& input,
3535
const std::set<std::string>& expected, bool verbose) {
3636
auto tags = analyzer.generateTags(input);
37-
std::set<std::string> actual(tags.begin(), tags.end());
37+
std::set<std::string> actual;
38+
for (const auto& t : tags) {
39+
actual.insert(t.tag);
40+
}
3841

3942
bool passed = (actual == expected);
4043

src/suzume.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,12 +179,12 @@ std::vector<core::Morpheme> Suzume::analyzeDebug(std::string_view text,
179179
return impl_->postprocessor.process(morphemes);
180180
}
181181

182-
std::vector<std::string> Suzume::generateTags(std::string_view text) const {
182+
std::vector<postprocess::TagEntry> Suzume::generateTags(std::string_view text) const {
183183
auto morphemes = impl_->analyzer.analyze(text);
184184
return impl_->tag_generator.generate(morphemes);
185185
}
186186

187-
std::vector<std::string> Suzume::generateTags(
187+
std::vector<postprocess::TagEntry> Suzume::generateTags(
188188
std::string_view text,
189189
const postprocess::TagGeneratorOptions& options) const {
190190
auto morphemes = impl_->analyzer.analyze(text);

src/suzume.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,17 +100,17 @@ class Suzume {
100100
/**
101101
* @brief Generate tags from text
102102
* @param text UTF-8 encoded Japanese text
103-
* @return Vector of tag strings
103+
* @return Vector of tag entries with POS information
104104
*/
105-
std::vector<std::string> generateTags(std::string_view text) const;
105+
std::vector<postprocess::TagEntry> generateTags(std::string_view text) const;
106106

107107
/**
108108
* @brief Generate tags from text with custom options
109109
* @param text UTF-8 encoded Japanese text
110110
* @param options Tag generation options (POS filter, exclude_basic, etc.)
111-
* @return Vector of tag strings
111+
* @return Vector of tag entries with POS information
112112
*/
113-
std::vector<std::string> generateTags(
113+
std::vector<postprocess::TagEntry> generateTags(
114114
std::string_view text,
115115
const postprocess::TagGeneratorOptions& options) const;
116116

src/suzume_c.cpp

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -168,14 +168,22 @@ SUZUME_EXPORT suzume_tags_t* suzume_generate_tags(suzume_t handle,
168168

169169
if (result->count == 0) {
170170
result->tags = nullptr;
171+
result->pos = nullptr;
171172
return result;
172173
}
173174

174175
result->tags = new char*[result->count];
176+
result->pos = new const char*[result->count];
175177

176178
for (size_t idx = 0; idx < result->count; ++idx) {
177-
result->tags[idx] = new char[tags[idx].size() + 1];
178-
std::strcpy(result->tags[idx], tags[idx].c_str());
179+
result->tags[idx] = new char[tags[idx].tag.size() + 1];
180+
std::strcpy(result->tags[idx], tags[idx].tag.c_str());
181+
182+
auto pos_str = suzume::core::posToString(tags[idx].pos);
183+
auto* pos_copy = new char[pos_str.size() + 1];
184+
std::memcpy(pos_copy, pos_str.data(), pos_str.size());
185+
pos_copy[pos_str.size()] = '\0';
186+
result->pos[idx] = pos_copy;
179187
}
180188

181189
return result;
@@ -205,14 +213,22 @@ SUZUME_EXPORT suzume_tags_t* suzume_generate_tags_with_options(
205213

206214
if (result->count == 0) {
207215
result->tags = nullptr;
216+
result->pos = nullptr;
208217
return result;
209218
}
210219

211220
result->tags = new char*[result->count];
221+
result->pos = new const char*[result->count];
212222

213223
for (size_t idx = 0; idx < result->count; ++idx) {
214-
result->tags[idx] = new char[tags[idx].size() + 1];
215-
std::strcpy(result->tags[idx], tags[idx].c_str());
224+
result->tags[idx] = new char[tags[idx].tag.size() + 1];
225+
std::strcpy(result->tags[idx], tags[idx].tag.c_str());
226+
227+
auto pos_str = suzume::core::posToString(tags[idx].pos);
228+
auto* pos_copy = new char[pos_str.size() + 1];
229+
std::memcpy(pos_copy, pos_str.data(), pos_str.size());
230+
pos_copy[pos_str.size()] = '\0';
231+
result->pos[idx] = pos_copy;
216232
}
217233

218234
return result;
@@ -233,6 +249,14 @@ SUZUME_EXPORT void suzume_tags_free(suzume_tags_t* tags) {
233249
delete[] tags->tags;
234250
}
235251

252+
if (tags->pos != nullptr) {
253+
for (size_t idx = 0; idx < tags->count; ++idx) {
254+
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
255+
delete[] const_cast<char*>(tags->pos[idx]);
256+
}
257+
delete[] tags->pos;
258+
}
259+
236260
delete tags;
237261
}
238262

src/suzume_c.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,9 @@ typedef struct {
5252
* @brief Tag generation result structure
5353
*/
5454
typedef struct {
55-
char** tags; /**< Array of tag strings */
56-
size_t count; /**< Number of tags */
55+
char** tags; /**< Array of tag strings */
56+
const char** pos; /**< Array of POS strings (English, e.g. "NOUN", "VERB") */
57+
size_t count; /**< Number of tags */
5758
} suzume_tags_t;
5859

5960
/**

0 commit comments

Comments
 (0)