[PC] Lucene.Net Ver. 4.8.0 beta-00005 ～Lucene.Netでインデックス作成してみる～

Lucene.Netでインデックス作成

Lucene.NetがJapaneseAnalyzerに対応したということで、早速インデックスの作成プログラムを書いてみました。まずは、Visual Studio Community 2017 (以下、VS2017)の設定方法から説明します。

プロジェクトファイル作成

VS2017のメニューの[ファイル] > [新規作成] > [プロジェクト]から、.NET Frameworkのコンソールアプリを選択し、名前を付けて保存します。

次に、[ツール] > [NuGet パッケージマネージャー] > [ソリューションのNuGetパッケージの管理]を選択し、NuGetのパッケージ管理画面を表示します。検索画面で、「Lucene.Net」と入力し、且つ[プレリリースを含める]チェックボックスをオンにして、検索します。

Lucene.Netのv4.8.0-beta-00005を選択します。そして、インストール先として先ほど作成したプロジェクトを選択して、[インストール]ボタンをクリックします。

同様にして、Lucene.Net.Analysis.Kuromojiをインストールします。

Commonも同時にインストールするかという問い合わせが表示されるため、[OK]をクリックして合わせて入れておきます。

無事にCommonとKuromojiとがインストールされるのを確認します。

コーディング

次は、コーディングなのですが、FlexLuceneで使ったものを使いまわします。もちろんusing Lucene.Net.*という形になります。また、JapaneseAnalyzerやIndexWriterConfigの引数が古い形なのでバージョンを指定してあげる必要があります。LuceneVersion.LUCENE_48をつけます。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using Lucene.Net.Analysis.Ja;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;

namespace LdNetIndex
{
    class Program
    {
        const string TARGETDIR = @"C:\temp\testdir"; // Source folder whose files are indexed
        const string INDEXDIR = @"c:\temp\ldn-index";　// Location of the index
        // Names of the fields added to each indexed document.
        const string FIELD_TITLE = "title";
        const string FIELD_PLACE = "place";
        const string FIELD_CONTENT = "content";
        const string FIELD_MODIFIED = "modified";

        /// <summary>
        /// Entry point: opens (or creates) the index under INDEXDIR, indexes every
        /// "*.htm*" file found below TARGETDIR, commits, and prints the elapsed
        /// time in ticks.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Declared outside the try block so that everything that was opened can be
            // released in finally. "using" statements are not an option here because
            // IndexFolder takes the writer and reader by ref, and using-variables
            // cannot be passed by ref.
            FSDirectory dir = null;
            JapaneseAnalyzer analyzer = null;
            IndexWriter writer = null;
            IndexReader reader = null;
            try
            {
                dir = FSDirectory.Open(INDEXDIR);

                // Define how text is analyzed (the analyzer).
                analyzer = new JapaneseAnalyzer(LuceneVersion.LUCENE_48);
                IndexWriterConfig config = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer);
                config.OpenMode = OpenMode.CREATE_OR_APPEND;
                writer = new IndexWriter(dir, config);
                reader = DirectoryReader.Open(writer, false);

                // Start timing. Stopwatch is not affected by wall-clock adjustments.
                System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();

                // Walk the source folder and index every HTML file in it.
                IndexFolder(ref writer, ref reader, TARGETDIR, "*.htm*");

                // Make the indexed documents durable.
                writer.Commit();

                // Stop timing and report (same tick unit as TimeSpan.Ticks).
                stopwatch.Stop();
                System.Console.WriteLine("{0}タイマ刻み数かかりました", stopwatch.Elapsed.Ticks);
            }
            finally
            {
                // Release in reverse order of creation.
                if (reader != null)
                {
                    reader.Dispose();
                }
                if (writer != null)
                {
                    writer.Dispose();
                }
                if (analyzer != null)
                {
                    analyzer.Dispose();
                }
                if (dir != null)
                {
                    dir.Dispose();
                }
            }
        }

        /// <summary>
        /// Indexes every file directly inside <paramref name="srcDirName"/> whose name
        /// matches <paramref name="filterString"/>, then calls itself for each
        /// subdirectory.
        /// </summary>
        /// <param name="writer">Index writer that receives the documents.</param>
        /// <param name="reader">Not used by this method itself; only forwarded to the recursive calls.</param>
        /// <param name="srcDirName">Directory to scan.</param>
        /// <param name="filterString">Search pattern passed to Directory.GetFiles.</param>
        private static void IndexFolder(ref IndexWriter writer, ref IndexReader reader, string srcDirName, string filterString)
        {
            string[] files = System.IO.Directory.GetFiles(srcDirName, filterString, System.IO.SearchOption.TopDirectoryOnly);

            foreach (string file in files)
            {
                // Handle I/O failures per file so that one unreadable file does not
                // prevent the remaining files in this folder from being indexed.
                try
                {
                    string title = "";
                    string content = "";
                    string f = file;
                    HTMLParse(ref title, ref content, ref f);

                    // Read the timestamp once so both "modified" fields carry the same value.
                    // NOTE(review): ToString() formats with the current culture, so sorting on
                    // this doc-values field is presumably not chronological - confirm whether
                    // a sortable format is wanted before changing what is stored.
                    string modified = System.IO.File.GetLastWriteTime(f).ToString();

                    // Build the document.
                    Document doc = new Document();
                    doc.Add(new StringField(FIELD_TITLE, title, Field.Store.YES));
                    doc.Add(new SortedDocValuesField(FIELD_MODIFIED, new BytesRef(modified)));
                    doc.Add(new StoredField(FIELD_MODIFIED, modified));
                    doc.Add(new StringField(FIELD_PLACE, f, Field.Store.YES));
                    doc.Add(new TextField(FIELD_CONTENT, content, Field.Store.YES));

                    // Replace any document previously indexed for the same path; the term
                    // uses the same constant as the field it has to match.
                    writer.UpdateDocument(new Term(FIELD_PLACE, f), doc);
                }
                catch (System.IO.IOException ex)
                {
                    System.Console.WriteLine(ex.ToString());
                }
            }

            // Recurse into every subdirectory of the source directory.
            string[] dirs = System.IO.Directory.GetDirectories(srcDirName);
            foreach (string dir in dirs)
            {
                IndexFolder(ref writer, ref reader, dir, filterString);
            }
        }

        /// <summary>
        /// Reads an HTML file, decoded as Shift_JIS, and extracts the inner text of its
        /// title element and of its body element.
        /// </summary>
        /// <param name="title">Set to the text of the first title element; left unchanged when there is none.</param>
        /// <param name="content">Set to the text of the body element; left unchanged when there is no body tag.</param>
        /// <param name="fileName">Path of the file to read (not modified).</param>
        private static void HTMLParse(ref string title, ref string content, ref string fileName)
        {
            string text;
            // Dispose the reader so the file handle is released as soon as the text is read.
            using (StreamReader sr = new StreamReader(fileName, Encoding.GetEncoding("Shift_JIS")))
            {
                text = sr.ReadToEnd();
            }

            // Singleline lets '.' match newlines, so an element may span several lines.
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            // The lazy group needs something after it to stop at: a trailing "(.*?)"
            // always matches the empty string, which would leave the title empty.
            Match titleMatch = Regex.Match(text, @"<title[^>]*>(.*?)</title>", options);
            if (titleMatch.Success)
            {
                title = titleMatch.Groups[1].Value;
            }

            // For the body, fall back to the end of the text when the closing tag is missing.
            Match bodyMatch = Regex.Match(text, @"<body[^>]*>(.*?)(?:</body>|\z)", options);
            if (bodyMatch.Success)
            {
                content = bodyMatch.Groups[1].Value;
            }
        }

若干、記載が古い形式なので、config.OpenMode = OpenMode.CREATE_OR_APPEND;とかは、直接値をセットする形になっています。FlexLuceneのバージョン6.3.0を使った場合は、Get/Setで入力していたのと少し違います。ただ、ほんの少しの手直しで動作させることができました。

まとめ

Lucene.Net v4.8.0 beta-00005でJapaneseAnalyzerがサポートされたことで、いろいろな展開ができるようになると思います。ゆくゆくはAndroidアプリ(Xamarin)に利用できないかと夢は膨らむばかりです。是非みなさんもLucene.Netを使った全文検索をお試しください。