lucene创建索引

2021-06-30 18:04

阅读:519

标签:lucene创建索引

1.导入jar包

技术分享


2.创建实体Bean

package com.zhishang.lucene;

/**
 * Created by Administrator on 2017/7/8.
 *
 * Simple mutable holder for the three pieces of data kept per HTML page:
 * its title, its text content and its URL. All fields start out as
 * {@code null} until the matching setter is called.
 */
public class HtmlBean {
    private String title;
    private String content;
    private String url;

    /** @return the stored title, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the stored content, or {@code null} if none has been set */
    public String getContent() {
        return this.content;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the stored URL, or {@code null} if none has been set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }
}


3.创建工具类(HtmlBeanUtil)

package com.zhishang.lucene;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import org.junit.Test;

import java.io.File;
import java.io.IOException;

/**
 * Created by Administrator on 2017/7/8.
 */
public class HtmlBeanUtil {


    public static HtmlBean parseHtml(File file){
        try {
            Source sc = new Source(file);
            Element element = sc.getFirstElement(HTMLElementName.TITLE);
            if (element == null || element.getTextExtractor() == null){
                return null;
            }

            HtmlBean htmlBean = new HtmlBean();
            htmlBean.setTitle(element.getTextExtractor().toString());
            htmlBean.setContent(sc.getTextExtractor().toString());
            htmlBean.setUrl(file.getAbsolutePath());

            return htmlBean;
        } catch (IOException e) {
            e.printStackTrace();
        }

        return null;
    }
}


4.创建操作Bean

package com.zhishang.lucene;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.File;
import java.io.IOException;
import java.util.Collection;

/**
 * Created by Administrator on 2017/7/7.
 */
public class CreateIndex {
    public static final String indexDir = "G:/index";
    public static final String dataDir = "G:/data";

    public void createIndex(){
        try {
            Directory dir = FSDirectory.open(new File(indexDir));
            //分词器
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9,analyzer);
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            IndexWriter writer = new IndexWriter(dir,config);
            File file = new File(dataDir);

            RAMDirectory ramdir = new RAMDirectory();
            Analyzer analyzer1 = new IKAnalyzer();
            IndexWriterConfig config1 = new IndexWriterConfig(Version.LUCENE_4_9,analyzer1);
            IndexWriter ramWriter = new IndexWriter(ramdir,config1);

            Collection files = FileUtils.listFiles(file, TrueFileFilter.INSTANCE,TrueFileFilter.INSTANCE);
            int count = 0;
            for(File f:files){
                HtmlBean bean =  HtmlBeanUtil.parseHtml(f);
                if(bean != null){
                    Document document = new Document();
                    document.add(new StringField("title",bean.getTitle(), Field.Store.YES));
                    document.add(new TextField("content",bean.getContent(), Field.Store.YES));
                    document.add(new StringField("url",bean.getUrl(), Field.Store.YES));
                    ramWriter.addDocument(document);
                    count++;
                    if (count == 50){
                        ramWriter.close();
                        writer.addIndexes(ramdir);
                        ramdir = new RAMDirectory();
                        Analyzer analyzer2 = new IKAnalyzer();
                        IndexWriterConfig config2 = new IndexWriterConfig(Version.LUCENE_4_9,analyzer2);
                        ramWriter = new IndexWriter(ramdir,config2);
                        count = 0;
                    }

                }
            }
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }
}


5.创建测试类(LuceneBean)

package com.zhishang.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import java.io.File;

/**
 * Created by Administrator on 2017/7/8.
 */
public class LuceneBean {

    /*
    创建索引
     */
    @Test
    public void createIndex(){
        File file = new File(CreateIndex.indexDir);
        if (file.exists()){
            file.delete();
            file.mkdirs();
        }
        CreateIndex createIndex = new CreateIndex();
        createIndex.createIndex();
    }
}


6.查看生成的索引文件

技术分享

本文出自 “素颜” 博客,请务必保留此出处http://suyanzhu.blog.51cto.com/8050189/1945466

lucene创建索引

标签:lucene创建索引

原文地址:http://suyanzhu.blog.51cto.com/8050189/1945466

上一篇:jericho解析html

下一篇:reset.css


评论


亲,登录后才可以留言!