Both sides previous revision
Previous revision
Next revision
|
Previous revision
|
stepbystep:baseservices [2021/03/29 15:11] giancarlo [Cantaloupe] |
stepbystep:baseservices [2021/07/15 16:23] (current) giancarlo [SOLR] |
\\ | \\ |
\\ | \\ |
Update Cantaloupe to 5.0 as this version was released. | [[#cantaupdate5|Update Cantaloupe to 5.0 as this version was released.]] |
| <wrap #cantaupdate5></wrap> |
<code bash> | <code bash> |
wget https://github.com/cantaloupe-project/cantaloupe/releases/download/v5.0/cantaloupe-5.0.zip | wget https://github.com/cantaloupe-project/cantaloupe/releases/download/v5.0/cantaloupe-5.0.zip |
sudo mv target/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/ | sudo mv target/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/ |
</code> | </code> |
| We need the latest ocrhighlighting build from the main branch (0.7.0-SNAPSHOT) |
| <code bash> |
| $ wget https://github.com/dbmdz/solr-ocrhighlighting/archive/main.zip |
| $ unzip main.zip |
| $ cd solr-ocrhighlighting-main/ |
| $ JAVA_HOME=/usr/lib/jvm/java-14-openjdk-amd64 mvn package |
| $ cd .. |
| $ sudo mv /opt/solr/contrib/archipelago/lib/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar ./ |
| $ sudo mv solr-ocrhighlighting-main/target/solr-ocrhighlighting-0.7.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/ |
| $ rm -R solr-ocrhighlighting-main |
| |
| # Reload Solr |
| </code> |
| Update to ocrhighlighting 0.7.0 release |
| <code bash> |
| $ wget https://github.com/dbmdz/solr-ocrhighlighting/archive/refs/tags/0.7.0.zip |
| $ unzip 0.7.0.zip |
| $ cd solr-ocrhighlighting-0.7.0 |
| $ JAVA_HOME=/usr/lib/jvm/java-14-openjdk-amd64 mvn package |
| $ cd .. |
| $ sudo mv /opt/solr/contrib/archipelago/lib/solr-ocrhighlighting-0.7.0-SNAPSHOT.jar ./ |
| $ sudo mv solr-ocrhighlighting-0.7.0/target/solr-ocrhighlighting-0.7.0.jar /opt/solr/contrib/archipelago/lib/ |
| $ rm -R solr-ocrhighlighting-0.7.0 |
| |
| # Reload Solr |
| </code> |
| |
Create archipelago core | Create archipelago core |
<code bash> | <code bash> |
$ sudo systemctl start solr | $ sudo systemctl start solr |
</code> | </code> |
Edit Solr filters for Italian text language | [[#solrita|Edit Solr filters for Italian-language text]] |
| <wrap #solrita></wrap> |
<code bash> | <code bash> |
$ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema_extra_types.xml | $ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema_extra_types.xml |
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> | <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> |
| <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" /> |
<filter class="solr.SnowballPorterFilterFactory" language="Italian"/> | <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> |
<filter class="solr.LengthFilterFactory" min="2" max="100"/> | <filter class="solr.LengthFilterFactory" min="3" max="100"/> |
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/> | |
</analyzer> | </analyzer> |
<analyzer type="query"> | <analyzer type="query"> |
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> | <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> |
| <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" /> |
<filter class="solr.SnowballPorterFilterFactory" language="Italian"/> | <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> |
<filter class="solr.LengthFilterFactory" min="2" max="100"/> | <filter class="solr.LengthFilterFactory" min="3" max="100"/> |
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/> | |
</analyzer> | </analyzer> |
</fieldType> | </fieldType> |
| |
| |
<fieldType name="text_ocr_stored" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true"> | <fieldType name="text_ocr_stored" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true"> |
<analyzer type="index"> | <analyzer type="index"> |
<charFilter class="de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory"/> | <charFilter class="de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory"/> |
| <charFilter class="solr.MappingCharFilterFactory" mapping="accents_und.txt"/> |
<tokenizer class="solr.StandardTokenizerFactory"/> | <tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.LowerCaseFilterFactory"/> | <filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> | <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> |
| <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" /> |
<filter class="solr.SnowballPorterFilterFactory" language="Italian"/> | <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> |
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false" /> | <filter class="solr.LengthFilterFactory" min="3" max="100"/> |
</analyzer> | </analyzer> |
<analyzer type="query"> | <analyzer type="query"> |
| <charFilter class="solr.MappingCharFilterFactory" mapping="accents_und.txt"/> |
<tokenizer class="solr.StandardTokenizerFactory"/> | <tokenizer class="solr.StandardTokenizerFactory"/> |
<filter class="solr.LowerCaseFilterFactory"/> | <filter class="solr.LowerCaseFilterFactory"/> |
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> | <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> |
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> |
| <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" /> |
<filter class="solr.SnowballPorterFilterFactory" language="Italian"/> | <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> |
<filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false" /> | <filter class="solr.LengthFilterFactory" min="3" max="100"/> |
</analyzer> | </analyzer> |
</fieldType> | </fieldType> |
| </code> |
| [[#solrsugg|Edit the Solr schema so that the Suggester splits on words without punctuation]] |
| <wrap #solrsugg></wrap> |
| <code bash> |
| $ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema.xml |
| </code> |
| <code xml> |
| <fieldType name="text_ws" class="solr.TextField" omitNorms="true" positionIncrementGap="100"> |
| <analyzer> |
| <!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/> --> |
| <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| </analyzer> |
| </fieldType> |
</code> | </code> |