Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

10046/fix/question char not searchable fixed #10099

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 40 additions & 36 deletions conf/solr/conf/managed-schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -578,42 +578,46 @@
cases will work, for example query "wi fi" will match
document "WiFi" or "wi-fi".
-->
<dynamicField name="*_txt_en_split"
              type="text_en_splitting"
              indexed="true"
              stored="true"/>
<!-- text_en_splitting: whitespace-tokenized English text that is additionally
     split by the word delimiter graph filter, folded, lowercased and stemmed.
     The index chain catenates word/number parts; the query chain does not. -->
<fieldType name="text_en_splitting"
           class="solr.TextField"
           positionIncrementGap="100"
           autoGeneratePhraseQueries="true">
  <!-- Index-time analysis chain. -->
  <analyzer type="index">
    <tokenizer name="whitespace"/>
    <!-- Synonyms are applied at query time only; the index-time variant is kept for reference:
    <filter name="synonymGraph" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
    -->
    <!-- Case-insensitive stop word removal is disabled because it caused problems with our
         queries. See https://github.com/internetarchive/openlibrary/issues/3317#issuecomment-837506502 -->
    <!-- <filter name="stop"
                 ignoreCase="true"
                 words="lang/stopwords_en.txt"
    /> -->
    <filter name="icuFolding"/>
    <filter name="wordDelimiterGraph"
            types="wordtypes.txt"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter name="lowercase"/>
    <!-- Terms listed in protwords.txt are marked as keywords ahead of the stemmer. -->
    <filter name="keywordMarker" protected="protwords.txt"/>
    <filter name="porterStem"/>
    <!-- Flatten the token graph produced above before the tokens are indexed. -->
    <filter name="flattenGraph"/>
  </analyzer>
  <!-- Query-time analysis chain. -->
  <analyzer type="query">
    <tokenizer name="whitespace"/>
    <filter name="synonymGraph" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <!-- Stop word removal is disabled here as well, for the same reason as at index time.
         See https://github.com/internetarchive/openlibrary/issues/3317#issuecomment-837506502 -->
    <!-- <filter name="stop"
                 ignoreCase="true"
                 words="lang/stopwords_en.txt"
    /> -->
    <filter name="icuFolding"/>
    <!-- Same splitting rules as the index chain, but without catenation. -->
    <filter name="wordDelimiterGraph"
            types="wordtypes.txt"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter name="lowercase"/>
    <filter name="keywordMarker" protected="protwords.txt"/>
    <filter name="porterStem"/>
  </analyzer>
</fieldType>
<dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true"/>
<!-- text_en_splitting: whitespace-tokenized English text that is split by the word
     delimiter graph filter (character classes overridden via wordtypes.txt), folded,
     lowercased and stemmed. preserveOriginal="1" is set in both chains so the unsplit
     token is kept next to its parts. -->
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <analyzer type="index">
    <tokenizer name="whitespace"/>
    <!-- in this example, we will only use synonyms at query time
    <filter name="synonymGraph" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
    -->
    <!-- Case insensitive stop word removal.
    -->
    <!-- Disabling because it's causing a few issues with our queries. See https://github.com/internetarchive/openlibrary/issues/3317#issuecomment-837506502 -->
    <!-- <filter name="stop"
                 ignoreCase="true"
                 words="lang/stopwords_en.txt"
    /> -->
    <filter name="icuFolding"/>
    <filter name="wordDelimiterGraph" types="wordtypes.txt" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" splitOnNumerics="1" preserveOriginal="1"/>
    <filter name="lowercase"/>
    <filter name="keywordMarker" protected="protwords.txt"/>
    <filter name="porterStem"/>
    <!-- Flatten the token graph produced above before the tokens are indexed. -->
    <filter name="flattenGraph"/>
    <!-- Index reversed copies of tokens (withOriginal="true" keeps the originals too) so that
         leading-wildcard queries can be served efficiently. This filter belongs in the index
         chain only: Solr's query parser detects it here and rewrites wildcard queries itself. -->
    <filter name="reversedWildcard" withOriginal="true"
            maxPosAsterisk="2" maxPosQuestion="1" minTrailing="2" maxFractionAsterisk="0"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer name="whitespace"/>
    <filter name="synonymGraph" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <!-- Disabling because it's causing a few issues with our queries. See https://github.com/internetarchive/openlibrary/issues/3317#issuecomment-837506502 -->
    <!-- <filter name="stop"
                 ignoreCase="true"
                 words="lang/stopwords_en.txt"
    /> -->
    <filter name="icuFolding"/>
    <filter name="wordDelimiterGraph" types="wordtypes.txt" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" splitOnNumerics="1" preserveOriginal="1"/>
    <filter name="lowercase"/>
    <filter name="keywordMarker" protected="protwords.txt"/>
    <filter name="porterStem"/>
    <!-- No reversedWildcard filter here on purpose: in a query chain it would add a reversed
         copy of every ordinary query term, distorting scoring and phrase matching. -->
  </analyzer>
</fieldType>

<!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
Expand Down
4 changes: 3 additions & 1 deletion conf/solr/conf/wordtypes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,6 @@
# "copyright" symbol (©), see https://github.com/internetarchive/openlibrary/issues/7555
\u00A9 => ALPHANUM
# number sign (#), only as alphabetic for case like 'C#'
\u0023 => ALPHA
\u0023 => ALPHA
# question mark (?), treat as alphanumeric (not as a delimiter) so it is kept in tokens and stays searchable
\u003F => ALPHANUM
Loading