stepbystep:baseservices

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
stepbystep:baseservices [2021/03/29 15:11]
giancarlo [Cantaloupe]
stepbystep:baseservices [2021/07/15 16:23] (current)
giancarlo [SOLR]
Line 438: Line 438:
 \\ \\
 \\ \\
-Update Cantaloupe to 5.0 now that this version has been released.+[[#cantaupdate5|Update Cantaloupe to 5.0 now that this version has been released.]] 
 +<wrap #cantaupdate5></wrap>
 <code bash> <code bash>
 wget https://github.com/cantaloupe-project/cantaloupe/releases/download/v5.0/cantaloupe-5.0.zip wget https://github.com/cantaloupe-project/cantaloupe/releases/download/v5.0/cantaloupe-5.0.zip
Line 583: Line 584:
 sudo mv target/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/ sudo mv target/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/
 </code> </code>
 +We need the latest ocrhighlighting build from the main branch (0.7.0-SNAPSHOT):
 +<code bash>
 +$ wget https://github.com/dbmdz/solr-ocrhighlighting/archive/main.zip
 +$ unzip main.zip
 +$ cd solr-ocrhighlighting-main/
 +$ JAVA_HOME=/usr/lib/jvm/java-14-openjdk-amd64 mvn package
 +$ cd ..
 +$ sudo mv /opt/solr/contrib/archipelago/lib/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar ./
 +$ sudo mv solr-ocrhighlighting-main/target/solr-ocrhighlighting-0.7.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/
 +$ rm -R solr-ocrhighlighting-main
 +
 +Reload Solr
 +</code>
 +Update to the ocrhighlighting 0.7.0 release:
 +<code bash>
 +$ wget https://github.com/dbmdz/solr-ocrhighlighting/archive/refs/tags/0.7.0.zip
 +$ unzip 0.7.0.zip
 +$ cd solr-ocrhighlighting-0.7.0
 +$ JAVA_HOME=/usr/lib/jvm/java-14-openjdk-amd64 mvn package
 +$ cd ..
 +$ sudo mv /opt/solr/contrib/archipelago/lib/solr-ocrhighlighting-0.7.0-SNAPSHOT.jar ./
 +$ sudo mv solr-ocrhighlighting-0.7.0/target/solr-ocrhighlighting-0.7.0.jar /opt/solr/contrib/archipelago/lib/
 +$ rm -R solr-ocrhighlighting-0.7.0
 +
 +Reload Solr
 +</code>
 +
 Create archipelago core Create archipelago core
 <code bash> <code bash>
Line 608: Line 636:
 $ sudo systemctl start solr $ sudo systemctl start solr
 </code> </code>
-Edit Solr filters for Italian-language text+[[#solrita|Edit Solr filters for Italian-language text]] 
 +<wrap #solrita></wrap>
 <code bash> <code bash>
 $ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema_extra_types.xml $ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema_extra_types.xml
Line 620: Line 649:
     <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/>     <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/>
     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/>     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/>
 +    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" />
     <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>     <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>
-    <filter class="solr.LengthFilterFactory" min="2" max="100"/> +    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
-    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>+
   </analyzer>   </analyzer>
   <analyzer type="query">   <analyzer type="query">
Line 630: Line 659:
     <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/>     <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/>
     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/>     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/>
 +    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" />
     <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>     <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>
-    <filter class="solr.LengthFilterFactory" min="2" max="100"/> +    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
-    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>+
   </analyzer>   </analyzer>
 </fieldType> </fieldType>
 +
  
 <fieldType name="text_ocr_stored" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true"> <fieldType name="text_ocr_stored" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true">
   <analyzer type="index">   <analyzer type="index">
     <charFilter class="de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory"/>     <charFilter class="de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory"/>
 +    <charFilter class="solr.MappingCharFilterFactory" mapping="accents_und.txt"/>
     <tokenizer class="solr.StandardTokenizerFactory"/>     <tokenizer class="solr.StandardTokenizerFactory"/>
     <filter class="solr.LowerCaseFilterFactory"/>     <filter class="solr.LowerCaseFilterFactory"/>
     <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/>     <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/>
     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/>     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/>
 +    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" />
     <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>     <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>
-    <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false" />+    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
   </analyzer>   </analyzer>
   <analyzer type="query">   <analyzer type="query">
 +    <charFilter class="solr.MappingCharFilterFactory" mapping="accents_und.txt"/>
     <tokenizer class="solr.StandardTokenizerFactory"/>     <tokenizer class="solr.StandardTokenizerFactory"/>
     <filter class="solr.LowerCaseFilterFactory"/>     <filter class="solr.LowerCaseFilterFactory"/>
     <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/>     <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/>
     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/>     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/>
 +    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" />
     <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>     <filter class="solr.SnowballPorterFilterFactory" language="Italian"/>
-    <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false" />+    <filter class="solr.LengthFilterFactory" min="3" max="100"/>
   </analyzer>   </analyzer>
 </fieldType> </fieldType>
 +</code>
 +[[#solrsugg|Edit the Solr schema so the Suggester splits on words and ignores punctuation]]
 +<wrap #solrsugg></wrap>
 +<code bash>
 +$ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema.xml
 +</code>
 +<code xml>
 +    <fieldType name="text_ws" class="solr.TextField" omitNorms="true" positionIncrementGap="100">
 +      <analyzer>
 +<!--        <tokenizer class="solr.WhitespaceTokenizerFactory"/> -->
 +        <tokenizer class="solr.StandardTokenizerFactory"/>
 +        <filter class="solr.LowerCaseFilterFactory"/>
 +      </analyzer>
 +    </fieldType>
 </code> </code>
  • stepbystep/baseservices.1617023516.txt.gz
  • Last modified: 2021/03/29 15:11
  • by giancarlo