| Both sides previous revision
Previous revision
Next revision
|
Previous revision
|
stepbystep:baseservices [2021/03/29 15:24] giancarlo [Cantaloupe] |
stepbystep:baseservices [2021/07/15 16:23] (current) giancarlo [SOLR] |
| \\ | \\ |
| \\ | \\ |
| [[#anchor]] | [[#cantaupdate5|Update Cantaloupe to 5.0 as this version was released.]] |
| <wrap #anchor>text or nothing</wrap> | <wrap #cantaupdate5></wrap> |
| Update Cantaloupe to 5.0 as this version was released. | |
| <code bash> | <code bash> |
| wget https://github.com/cantaloupe-project/cantaloupe/releases/download/v5.0/cantaloupe-5.0.zip | wget https://github.com/cantaloupe-project/cantaloupe/releases/download/v5.0/cantaloupe-5.0.zip |
| sudo mv target/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/ | sudo mv target/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/ |
| </code> | </code> |
| | We need the latest ocrhighlighting build from the main branch (0.7.0-SNAPSHOT) |
| | <code bash> |
| | $ wget https://github.com/dbmdz/solr-ocrhighlighting/archive/main.zip |
| | $ unzip main.zip |
| | $ cd solr-ocrhighlighting-main/ |
| | $ JAVA_HOME=/usr/lib/jvm/java-14-openjdk-amd64 mvn package |
| | $ cd .. |
| | $ sudo mv /opt/solr/contrib/archipelago/lib/solr-ocrhighlighting-0.6.0-SNAPSHOT.jar ./ |
| | $ sudo mv solr-ocrhighlighting-main/target/solr-ocrhighlighting-0.7.0-SNAPSHOT.jar /opt/solr/contrib/archipelago/lib/ |
| | $ rm -R solr-ocrhighlighting-main |
| | |
| | Reload Solr |
| | </code> |
| | Update to the ocrhighlighting 0.7.0 release |
| | <code bash> |
| | $ wget https://github.com/dbmdz/solr-ocrhighlighting/archive/refs/tags/0.7.0.zip |
| | $ unzip 0.7.0.zip |
| | $ cd solr-ocrhighlighting-0.7.0 |
| | $ JAVA_HOME=/usr/lib/jvm/java-14-openjdk-amd64 mvn package |
| | $ cd .. |
| | $ sudo mv /opt/solr/contrib/archipelago/lib/solr-ocrhighlighting-0.7.0-SNAPSHOT.jar ./ |
| | $ sudo mv solr-ocrhighlighting-0.7.0/target/solr-ocrhighlighting-0.7.0.jar /opt/solr/contrib/archipelago/lib/ |
| | $ rm -R solr-ocrhighlighting-0.7.0 |
| | |
| | Reload Solr |
| | </code> |
| | |
| Create archipelago core | Create archipelago core |
| <code bash> | <code bash> |
| $ sudo systemctl start solr | $ sudo systemctl start solr |
| </code> | </code> |
| Edit Solr filters for Italian text language | [[#solrita|Edit Solr filters for Italian-language text]] |
| | <wrap #solrita></wrap> |
| <code bash> | <code bash> |
| $ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema_extra_types.xml | $ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema_extra_types.xml |
| <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> | <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> |
| <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> |
| | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> | <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> |
| <filter class="solr.LengthFilterFactory" min="2" max="100"/> | <filter class="solr.LengthFilterFactory" min="3" max="100"/> |
| <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> | |
| </analyzer> | </analyzer> |
| <analyzer type="query"> | <analyzer type="query"> |
| <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> | <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> |
| <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> |
| | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> | <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> |
| <filter class="solr.LengthFilterFactory" min="2" max="100"/> | <filter class="solr.LengthFilterFactory" min="3" max="100"/> |
| <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> | |
| </analyzer> | </analyzer> |
| </fieldType> | </fieldType> |
| | |
| |
| <fieldType name="text_ocr_stored" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true"> | <fieldType name="text_ocr_stored" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true"> |
| <analyzer type="index"> | <analyzer type="index"> |
| <charFilter class="de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory"/> | <charFilter class="de.digitalcollections.solrocr.lucene.filters.OcrCharFilterFactory"/> |
| | <charFilter class="solr.MappingCharFilterFactory" mapping="accents_und.txt"/> |
| <tokenizer class="solr.StandardTokenizerFactory"/> | <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> | <filter class="solr.LowerCaseFilterFactory"/> |
| <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> | <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> |
| <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> |
| | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> | <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> |
| <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false" /> | <filter class="solr.LengthFilterFactory" min="3" max="100"/> |
| </analyzer> | </analyzer> |
| <analyzer type="query"> | <analyzer type="query"> |
| | <charFilter class="solr.MappingCharFilterFactory" mapping="accents_und.txt"/> |
| <tokenizer class="solr.StandardTokenizerFactory"/> | <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> | <filter class="solr.LowerCaseFilterFactory"/> |
| <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> | <filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt"/> |
| <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> | <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt"/> |
| | <filter class="solr.KeywordMarkerFilterFactory" protected="protwords_und.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> | <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> |
| <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false" /> | <filter class="solr.LengthFilterFactory" min="3" max="100"/> |
| </analyzer> | </analyzer> |
| </fieldType> | </fieldType> |
| | </code> |
| | [[#solrsugg|Edit the Solr schema so that the Suggester splits text into words without punctuation]] |
| | <wrap #solrsugg></wrap> |
| | <code bash> |
| | $ sudo -u solr nano -w /srv/solr/data/archipelago/conf/schema.xml |
| | </code> |
| | <code xml> |
| | <fieldType name="text_ws" class="solr.TextField" omitNorms="true" positionIncrementGap="100"> |
| | <analyzer> |
| | <!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/> --> |
| | <tokenizer class="solr.StandardTokenizerFactory"/> |
| | <filter class="solr.LowerCaseFilterFactory"/> |
| | </analyzer> |
| | </fieldType> |
| </code> | </code> |