/*
 * Provenance: payload of git patch e1c2697fe1f0dc1dfa6ba72ca864348b0721eec7
 * (subject "wip", author syuilo, Wed, 6 Sep 2017 19:41:36 +0900), which
 * creates src/tools/ai/categorizer.ts.
 *
 * The same patch adds two entries to package.json "dependencies":
 *   "bayes": "0.0.7"
 *   "mecab-async": "^0.1.0"
 */

import * as fs from 'fs';
const bayes = require('bayes');
const MeCab = require('mecab-async');
import Post from '../../api/models/post';

/**
 * Categorizes posts with a `bayes` classifier whose tokenizer is backed by
 * MeCab (`mecab-async`).
 *
 * The classifier state is persisted as JSON at `categorizerDbFilePath`.
 * Call {@link Categorizer.init} before any other method; until then
 * `classifier` is unset.
 */
export default class Categorizer {
  /** The `bayes` classifier instance; assigned by `init()`. */
  classifier: any;

  /** Path of the JSON file the classifier state is read from / written to. */
  categorizerDbFilePath: string;

  /** The `mecab-async` instance used for tokenization. */
  mecab: any;

  /**
   * @param categorizerDbFilePath Path of the persisted classifier state.
   * @param mecabCommand Command line assigned to the MeCab instance's
   *   `command` property.
   */
  constructor(categorizerDbFilePath: string, mecabCommand: string = 'mecab -d /usr/share/mecab/dic/mecab-ipadic-neologd') {
    this.categorizerDbFilePath = categorizerDbFilePath;

    this.mecab = new MeCab();
    this.mecab.command = mecabCommand;

    // `tokenizer` is handed to the classifier as a bare function reference,
    // so it must keep `this` bound to this instance.
    this.tokenizer = this.tokenizer.bind(this);
  }

  /**
   * Tokenizes `text` via MeCab's synchronous `wakachiSync`.
   *
   * @param text Text to tokenize.
   * @returns Whatever `wakachiSync` returns for `text`.
   */
  tokenizer(text: string) {
    return this.mecab.wakachiSync(text);
  }

  /**
   * Prepares `classifier`.
   *
   * First tries to restore it from the JSON file at `categorizerDbFilePath`.
   * If reading or parsing fails for any reason, a fresh classifier is built,
   * trained on every post whose `is_category_verified` flag is true, and
   * written back to the file.
   */
  async init(): Promise<void> {
    try {
      const db = fs.readFileSync(this.categorizerDbFilePath, {
        encoding: 'utf8'
      });

      this.classifier = bayes.fromJson(db);
      // A function cannot round-trip through JSON, so the tokenizer is
      // re-attached after restoring.
      this.classifier.tokenizer = this.tokenizer;
      return;
    } catch (e) {
      // Deliberate best effort: any failure to load the saved state
      // (missing file, unreadable or invalid content) means "retrain".
    }

    this.classifier = bayes({
      tokenizer: this.tokenizer
    });

    // Training data: posts whose category has been verified.
    const verifiedPosts = await Post.find({
      is_category_verified: true
    });

    // Train on each verified post.
    for (const post of verifiedPosts) {
      this.classifier.learn(post.text, post.category);
    }

    this.save();
  }

  /**
   * Records `category` as the verified category of a post, teaches it to the
   * classifier, and persists the classifier.
   *
   * @param id `_id` of the post.
   * @param category Category to assign and learn.
   * @throws Error if no post with that `_id` exists; nothing is written in
   *   that case.
   */
  async learn(id: any, category: string): Promise<void> {
    const post = await Post.findOne({ _id: id });

    if (post == null) {
      throw new Error(`Categorizer.learn: post not found: ${id}`);
    }

    // Awaited so a failed write rejects this call instead of being dropped,
    // and so the classifier is only trained once the post has been updated.
    await Post.update({ _id: id }, {
      $set: {
        category: category,
        is_category_verified: true
      }
    });

    this.classifier.learn(post.text, category);

    this.save();
  }

  /**
   * Predicts the category of a post from its text and stores the prediction
   * on the post. `is_category_verified` is not touched.
   *
   * @param id `_id` of the post.
   * @throws Error if no post with that `_id` exists.
   */
  async categorize(id: any): Promise<void> {
    const post = await Post.findOne({ _id: id });

    if (post == null) {
      throw new Error(`Categorizer.categorize: post not found: ${id}`);
    }

    const category = this.classifier.categorize(post.text);

    // Awaited so callers observe completion and failure of the write.
    await Post.update({ _id: id }, {
      $set: {
        category: category
      }
    });
  }

  /**
   * Categorizes arbitrary text without reading or writing any post.
   *
   * @param text Text to categorize.
   * @returns The classifier's `categorize` result for `text`.
   */
  async test(text: string) {
    return this.classifier.categorize(text);
  }

  /**
   * Synchronously writes the classifier's JSON state to
   * `categorizerDbFilePath` as UTF-8.
   */
  save(): void {
    fs.writeFileSync(this.categorizerDbFilePath, this.classifier.toJson(), {
      encoding: 'utf8'
    });
  }
}