打开APP
userphoto
未登录

开通VIP,畅享免费电子书等14项超值服务

开通VIP
solr使用教程四【面试+工作】

<!-- Finnish -->

    <fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>

        <!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> -->

      </analyzer>

    </fieldType>

    <!-- French -->

    <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <!-- removes l', etc -->

        <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.FrenchLightStemFilterFactory"/>

        <!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->

        <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->

      </analyzer>

    </fieldType>

    <!-- Irish -->

    <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <!-- removes d', etc -->

        <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>

        <!-- removes n-, etc. position increments is intentionally false! -->

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt"

                enablePositionIncrements="false"/>

        <filter class="solr.IrishLowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Irish"/>

      </analyzer>

    </fieldType>

    <!-- Galician -->

    <fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt"

                enablePositionIncrements="true"/>

        <filter class="solr.GalicianStemFilterFactory"/>

        <!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->

      </analyzer>

    </fieldType>

    <!-- Hindi -->

    <fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <!-- normalizes unicode representation -->

        <filter class="solr.IndicNormalizationFilterFactory"/>

        <!-- normalizes variation in spelling -->

        <filter class="solr.HindiNormalizationFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt"

                enablePositionIncrements="true"/>

        <filter class="solr.HindiStemFilterFactory"/>

      </analyzer>

    </fieldType>

    <!-- Hungarian -->

    <fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>

        <!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> -->

      </analyzer>

    </fieldType>

    <!-- Armenian -->

    <fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>

      </analyzer>

    </fieldType>

    <!-- Indonesian -->

    <fieldType name="text_id" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt"

                enablePositionIncrements="true"/>

        <!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false -->

        <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>

      </analyzer>

    </fieldType>

    <!-- Italian -->

    <fieldType name="text_it" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <!-- removes l', etc -->

        <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.ItalianLightStemFilterFactory"/>

        <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->

      </analyzer>

    </fieldType>

    <!-- Japanese using morphological analysis (see text_cjk for a configuration using bigramming)

         NOTE: If you want to optimize search for precision, use default operator AND in your query

         parser config with <solrQueryParser defaultOperator="AND"/> further down in this file.  Use

         OR if you would like to optimize for recall (default).

    -->

    <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">

      <analyzer>

        <!-- Kuromoji Japanese morphological analyzer/tokenizer (JapaneseTokenizer)

             Kuromoji has a search mode (default) that does segmentation useful for search.  A heuristic

             is used to segment compounds into its parts and the compound itself is kept as synonym.

             Valid values for attribute mode are:

                normal: regular segmentation

                search: segmentation useful for search with synonyms compounds (default)

              extended: same as search mode, but unigrams unknown words (experimental)

             For some applications it might be good to use search mode for indexing and normal mode for

             queries to reduce recall and prevent parts of compounds from being matched and highlighted.

             Use <analyzer type="index"> and <analyzer type="query"> for this and mode normal in query.

             Kuromoji also has a convenient user dictionary feature that allows overriding the statistical

             model with your own entries for segmentation, part-of-speech tags and readings without a need

             to specify weights.  Notice that user dictionaries have not been subject to extensive testing.

             User dictionary attributes are:

                       userDictionary: user dictionary filename

               userDictionaryEncoding: user dictionary encoding (default is UTF-8)

             See lang/userdict_ja.txt for a sample user dictionary file.

             Punctuation characters are discarded by default.  Use discardPunctuation="false" to keep them.

             See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.

          -->

        <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>

        <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->

        <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->

        <filter class="solr.JapaneseBaseFormFilterFactory"/>

        <!-- Removes tokens with certain part-of-speech tags -->

        <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt"

                enablePositionIncrements="true"/>

        <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->

        <filter class="solr.CJKWidthFilterFactory"/>

        <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt"

                enablePositionIncrements="true"/>

        <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->

        <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>

        <!-- Lower-cases romaji characters -->

        <filter class="solr.LowerCaseFilterFactory"/>

      </analyzer>

    </fieldType>

    <!-- Latvian -->

    <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt"

                enablePositionIncrements="true"/>

        <filter class="solr.LatvianStemFilterFactory"/>

      </analyzer>

    </fieldType>

    <!-- Dutch -->

    <fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>

      </analyzer>

    </fieldType>

    <!-- Norwegian -->

    <fieldType name="text_no" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>

        <!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->

        <!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->

      </analyzer>

    </fieldType>

    <!-- Portuguese -->

    <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.PortugueseLightStemFilterFactory"/>

        <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->

        <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->

        <!-- most aggressive: <filter class="solr.PortugueseStemFilterFactory"/> -->

      </analyzer>

    </fieldType>

    <!-- Romanian -->

    <fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>

      </analyzer>

    </fieldType>

    <!-- Russian -->

    <fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Russian"/>

        <!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> -->

      </analyzer>

    </fieldType>

    <!-- Swedish -->

    <fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>

        <!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> -->

      </analyzer>

    </fieldType>

    <!-- Thai -->

    <fieldType name="text_th" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.LowerCaseFilterFactory"/>

        <filter class="solr.ThaiWordFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt"

                enablePositionIncrements="true"/>

      </analyzer>

    </fieldType>

    <!-- Turkish -->

    <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">

      <analyzer>

        <tokenizer class="solr.StandardTokenizerFactory"/>

        <filter class="solr.TurkishLowerCaseFilterFactory"/>

        <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt"

                enablePositionIncrements="true"/>

        <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>

      </analyzer>

    </fieldType>

  </types>

</schema>

本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请点击举报
打开APP,阅读全文并永久保存 查看更多类似文章
猜你喜欢
类似文章
【热】打开小程序,算一算2024你的财运
Solr基础知识二(导入数据)
solr 5.5.1安装并配置中文分词IKAnalyzer
关于solr不能搜索中文
Solr与tomcat整合,并添加中文分词器
Solr配置文件
【solr基础教程之二】索引
更多类似文章 >>
生活服务
热点新闻
分享 收藏 导长图 关注 下载文章
绑定账号成功
后续可登录账号畅享VIP特权!
如果VIP功能使用有故障,
可点击这里联系客服!

联系客服