
Use GETOPT for the command line scripts #326: index script
pkiraly committed Nov 3, 2023
1 parent 0dc4c92 commit f70d1df
Showing 8 changed files with 246 additions and 168 deletions.
19 changes: 14 additions & 5 deletions README.md
@@ -1682,19 +1682,28 @@ or

options:
* [general parameters](#general-parameters)
* `-s <URL>`, `--solrUrl <URL>`: the URL of Solr server
* `-c`, `--doCommit`: send commits to Solr regularly (not needed if you set up
* `-S <URL>`, `--solrUrl <URL>`: the URL of Solr server including the core (e.g. http://localhost:8983/solr/loc)
* `-A`, `--doCommit`: send commits to Solr regularly (not needed if you set up
Solr the way described above)
* `-t <type>`, `--solrFieldType <type>`: a Solr field type, one of the
* `-T <type>`, `--solrFieldType <type>`: a Solr field type, one of the
predefined values. See examples below.
* `marc-tags` - the field names are MARC codes
* `human-readable` - the field names are
[Self Descriptive MARC code](http://pkiraly.github.io/2017/09/24/mapping/)
* `mixed` - the field names are a mix of the above (e.g. `245a_Title_mainTitle`)
* `-A <URL>`, `--validationUrl <URL>`: the URL of the Solr server used in validation
* `-C`, `--indexWithTokenizedField`: index data elements as tokenized fields as well (each bibliographical data element
will be indexed twice: once as a phrase (fields suffixed with `_ss`), and once as a bag of words (fields suffixed
with `_txt`). \[This flag is available from v0.8.0\]
with `_txt`). \[This parameter is available from v0.8.0\]
* `-D <int>`, `--commitAt <int>`: commit index after this number of records \[This parameter is available from v0.8.0\]
* `-E`, `--indexFieldCounts`: index the count of field instances \[This parameter is available from v0.8.0\]

The `./index` script (which is used by the `catalogues/[catalogue].sh` and `./qa-catalogue` scripts) has additional parameters:
* `-Z <core>`, `--core <core>`: The index name (core). If not set, it will be extracted from the `solrUrl` parameter
* `-Y <path>`, `--file-path <path>`: File path
* `-X <mask>`, `--file-mask <mask>`: File mask
* `-W`, `--purge`: Purge index and exit
* `-V`, `--status`: Show the status of index(es) and exit
* `-U`, `--no-delete`: Do not delete documents in the index before starting indexing (by default the script clears the index)

The Solr URL is something like this: http://localhost:8983/solr/loc. It uses
the [Self Descriptive MARC code](http://pkiraly.github.io/2017/09/24/mapping/),
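To make the reworked option set concrete, here is a hypothetical invocation of the `./index` wrapper using the long option names documented above (the core name, file path, mask, and commit interval are placeholders, not values taken from this commit):

```bash
# Hypothetical example: core name, paths and file mask are placeholders.
./index --core loc \
        --file-path /path/to/marc/files \
        --file-mask '*.mrc' \
        --solrUrl http://localhost:8983/solr/loc \
        --solrFieldType mixed \
        --commitAt 10000 \
        --indexFieldCounts
```

With `--no-delete` the existing documents would be kept; otherwise the script clears the index before indexing, as described above.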
3 changes: 2 additions & 1 deletion catalogues/k10plus_pica_grouped.sh
@@ -3,12 +3,13 @@
. ./setdir.sh

SCHEMA=PICA
TYPE_PARAMS="--emptyLargeCollectors --indexWithTokenizedField"
TYPE_PARAMS="--emptyLargeCollectors"
TYPE_PARAMS="$TYPE_PARAMS --groupBy 001@\$0"
TYPE_PARAMS="$TYPE_PARAMS --groupListFile src/main/resources/k10plus-libraries-by-unique-iln.txt"
TYPE_PARAMS="$TYPE_PARAMS --ignorableFields 001@,001E,001L,001U,001U,001X,001X,002V,003C,003G,003Z,008G,017N,020F,027D,031B,037I,039V,042@,046G,046T,101@,101E,101U,102D,201E,201U,202D,1...,2..."
#TYPE_PARAMS="$TYPE_PARAMS --ignorableIssueTypes undefinedField"
TYPE_PARAMS="$TYPE_PARAMS --allowableRecords base64:"$(echo '002@.0 !~ "^L" && 002@.0 !~ "^..[iktN]" && (002@.0 !~ "^.v" || 021A.a?)' | base64 -w 0)
TYPE_PARAMS="$TYPE_PARAMS --solrUrl http://localhost:8983/solr/k10plus_pica_grouped"
TYPE_PARAMS="$TYPE_PARAMS --solrForScoresUrl http://localhost:8983/solr/k10plus_pica_grouped_validation"
TYPE_PARAMS="$TYPE_PARAMS --indexWithTokenizedField"
TYPE_PARAMS="$TYPE_PARAMS --indexFieldCounts"
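A side note on the `--allowableRecords base64:` line above: the PICA filter expression is base64-encoded on the fly so that its quotes and special characters survive shell quoting on the command line. The same pipe can be used to produce or inspect such a value (the decode line uses a placeholder string):

```bash
# Encode the PICA filter expression (the same pipe the script uses)
echo '002@.0 !~ "^L" && 002@.0 !~ "^..[iktN]" && (002@.0 !~ "^.v" || 021A.a?)' | base64 -w 0

# Decode an existing value to see what a catalogue script actually passes (placeholder shown)
echo '<base64-encoded-filter>' | base64 -d
```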
10 changes: 5 additions & 5 deletions common-script
@@ -50,10 +50,10 @@ do_index() {
untrace

PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+//g')
HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
PARAMS="${PARAMS} --validationCore ${NAME}_validation"
fi
# HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
# if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
# PARAMS="${PARAMS} --solrForScoresUrl ${NAME}_validation"
# fi
PARAMS="${PARAMS} --outputDir ${OUTPUT_DIR}"

ONLY_INDEX=$(echo ${PARAMS} | grep -c -P -e '--onlyIndex' || true)
@@ -63,7 +63,7 @@ do_index() {
PARAMS=$(echo ${PARAMS} | sed -r 's/\s*--onlyIndex//')
CORE=${NAME}
fi
./index --db ${CORE} --file-path ${MARC_DIR} --file-mask $MASK ${PARAMS} --trimId 2>> ${PREFIX}/solr.log
./index --core ${CORE} --file-path ${MARC_DIR} --file-mask $MASK ${PARAMS} --trimId 2>> ${PREFIX}/solr.log
}

do_postprocess_solr() {
315 changes: 171 additions & 144 deletions index

Large diffs are not rendered by default.

47 changes: 40 additions & 7 deletions scripts/cli-generator/generate.php
@@ -4,6 +4,8 @@
define('TITLE', 'QA catalogue %s');

$fileName = $argv[1];
$extraOptions = [];
$existentialFlags = [];
switch ($fileName) {
case 'validate.txt':
$url = URL . '#validating-marc-records';
@@ -37,6 +39,20 @@
$url = URL . '#shacl4bib';
$title = sprintf(TITLE, 'custom validation'); break;

case 'index.txt':
$url = URL . '#indexing-marc-records-with-solr';
$title = sprintf(TITLE, 'Indexing MARC records with Solr');
$extraOptions = [
(object)['short' => 'Z', 'long' => 'core', 'hasArg' => true, 'help' => 'The index name (core)', 'var' => 'CORE'],
(object)['short' => 'Y', 'long' => 'file-path', 'hasArg' => true, 'help' => 'File path', 'var' => 'FILE_PATH'],
(object)['short' => 'X', 'long' => 'file-mask', 'hasArg' => true, 'help' => 'File mask', 'var' => 'FILE_MASK'],
(object)['short' => 'W', 'long' => 'purge', 'hasArg' => false, 'help' => 'Purge index', 'var' => 'DO_PURGE'],
(object)['short' => 'V', 'long' => 'status', 'hasArg' => false, 'help' => 'Show the status of index(es)', 'var' => 'DO_STATUS'],
(object)['short' => 'U', 'long' => 'no-delete', 'hasArg' => false, 'help' => 'Do not delete index before starting indexing', 'var' => 'SKIP_DELETE']
];
$existentialFlags = ['solrUrl'];
break;

case 'completeness.txt':
default:
$url = URL . '#calculating-data-element-completeness';
@@ -47,6 +63,8 @@
$index = (object)['longs' => [], 'shorts' => []];
$options = readOptions('common.txt', $index);
$options = array_merge($options, readOptions($fileName, $index));
if (!empty($extraOptions))
$options = array_merge($options, $extraOptions);

createHelp($options);
echo LN;
@@ -99,25 +117,40 @@ function createCommandArguments($options) {
function createParser($options) {
global $maxLong;

$variables = [];
$lines = [];
foreach ($options as $option) {
$line = sprintf(' -%s|--%s) ', $option->short, $option->long);
$line .= str_pad(' ', $maxLong + 1 - strlen($option->long));
if ($option->hasArg) {
$line .= sprintf('PARAMS="$PARAMS --%s $2" ;%sshift 2 ;;', $option->long, str_pad(' ', $maxLong + 1 - strlen($option->long)));
if (isset($option->var)) {
$variables[] = sprintf('%s=0', $option->var);
$line .= sprintf('%s="$2" ;%sshift 2 ;;', $option->var, str_pad(' ', $maxLong + 18 - strlen($option->var)));
} else {
$line .= sprintf('PARAMS="$PARAMS --%s $2" ;%sshift 2 ;;', $option->long, str_pad(' ', $maxLong + 1 - strlen($option->long)));
}
} else {
$line .= sprintf(
'PARAMS="$PARAMS --%s" ;%s%s shift ;;',
$option->long,
($option->long == 'help' ? ' HELP=1; ' : ''),
str_pad(' ', $maxLong + ($option->long == 'help' ? -6 : 3) - strlen($option->long))
);
if (isset($option->var)) {
$variables[] = sprintf('%s=0', $option->var);
$line .= sprintf('%s=1 ;%sshift ;;', $option->var,
str_pad(' ', $maxLong + 21 - strlen($option->var))
);
} else {
$line .= sprintf(
'PARAMS="$PARAMS --%s" ;%s%s shift ;;',
$option->long,
($option->long == 'help' ? ' HELP=1; ' : ''),
str_pad(' ', $maxLong + ($option->long == 'help' ? -6 : 3) - strlen($option->long))
);
}
}
$lines[] = $line;
}
$allVariables = join(LN, $variables);
$cases = join(LN, $lines);

print <<<END
$allVariables
PARAMS=""
HELP=0
while true ; do
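To illustrate what `createParser()` now emits, here is a rough sketch of the generated bash parser for a few of the index options, inferred from the `sprintf` templates above. The `case`/`esac` scaffolding and the closing arms are assumptions, not verbatim generator output:

```bash
# Sketch of the generated parser (assumed shape). Options with a 'var' are stored
# in their own variable; options without one are appended to PARAMS as before.
CORE=0
DO_PURGE=0
PARAMS=""
HELP=0
while true ; do
  case "$1" in
    -Z|--core)    CORE="$2" ;                      shift 2 ;;  # argument option with 'var'
    -S|--solrUrl) PARAMS="$PARAMS --solrUrl $2" ;  shift 2 ;;  # argument option without 'var'
    -W|--purge)   DO_PURGE=1 ;                     shift ;;    # flag with 'var'
    -h|--help)    PARAMS="$PARAMS --help" ; HELP=1; shift ;;   # help flag (short name assumed)
    --) shift ; break ;;                                       # end-of-options marker (assumed)
    *)  break ;;
  esac
done
```

Keeping values such as the core name or the purge flag in dedicated variables lets the `./index` wrapper act on them itself (purge, status, file handling) instead of forwarding everything to the Java command line.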
7 changes: 7 additions & 0 deletions scripts/cli-generator/index.txt
@@ -0,0 +1,7 @@
options.addOption("S", "solrUrl", true, "the URL of Solr server including the core (e.g. http://localhost:8983/solr/loc)");
options.addOption("A", "doCommit", false, "commits Solr index regularly");
options.addOption("T", "solrFieldType", true, "type of Solr fields, could be one of 'marc-tags', 'human-readable', or 'mixed'");
options.addOption("B", "useEmbedded", false, "use embedded Solr server (used in tests only)");
options.addOption("C", "indexWithTokenizedField", false, "index data elements as tokenized field as well");
options.addOption("D", "commitAt", true, "commit index after this number of records");
options.addOption("E", "indexFieldCounts", false, "index the count of field instances");
6 changes: 3 additions & 3 deletions solr-functions
@@ -151,8 +151,8 @@ status() {

purge_core() {
LOCAL_CORE=$1
echo "Delete records in ${LOCAL_CORE}"
SOLR_DB_URL="${SOLR_HOST}/solr/${LOCAL_CORE}"
echo "Delete records in ${SOLR_DB_URL}"
curl -s $SOLR_DB_URL/update \
-H "Content-type: text/xml" \
--data-binary '<delete><query>*:*</query></delete>' \
@@ -161,8 +161,8 @@ purge_core() {

optimize_core() {
LOCAL_CORE=$1
echo "Optimizing ${LOCAL_CORE}"
SOLR_DB_URL="${SOLR_HOST}/solr/${LOCAL_CORE}"
echo "Optimizing ${SOLR_DB_URL}"
curl -s "$SOLR_DB_URL/update?optimize=true" \
-H 'Content-type: text/xml' \
--data-binary '<commit/>' \
Expand All @@ -175,7 +175,7 @@ purge_and_exit() {
purge_core $LOCAL_CORE
optimize_core $LOCAL_CORE
else
echo "You should give the name with --db parameter"
echo "You should give the name with --core parameter"
fi
exit 1
}
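For context, `purge_and_exit` above (together with the status functions) is what the wrapper's new `-W`/`--purge` and `-V`/`--status` options map to, so these maintenance paths can be exercised without re-indexing. A hypothetical session, with the core name as a placeholder:

```bash
# Show the status of the index(es) and exit
./index --core k10plus_pica_grouped --status

# Delete all documents from the core, optimize it, and exit
./index --core k10plus_pica_grouped --purge
```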
7 changes: 4 additions & 3 deletions MarcToSolrParameters.java
@@ -22,9 +22,9 @@ public class MarcToSolrParameters extends CommonParameters {
protected void setOptions() {
if (!isOptionSet) {
super.setOptions();
options.addOption("s", "solrUrl", true, "the URL of Solr server");
options.addOption("c", "doCommit", false, "commits Solr index regularly");
options.addOption("t", "solrFieldType", true,
options.addOption("S", "solrUrl", true, "the URL of Solr server including the core (e.g. http://localhost:8983/solr/loc)");
options.addOption("A", "doCommit", false, "commits Solr index regularly");
options.addOption("T", "solrFieldType", true,
"type of Solr fields, could be one of 'marc-tags', 'human-readable', or 'mixed'");
options.addOption("B", "useEmbedded", false, "use embedded Solr server (used in tests only)");
options.addOption("C", "indexWithTokenizedField", false, "index data elements as tokenized field as well");
@@ -110,6 +110,7 @@ public String formatParameters() {
text += String.format("doCommit: %s%n", doCommit);
text += String.format("solrFieldType: %s%n", solrFieldType);
text += String.format("indexWithTokenizedField: %s%n", indexWithTokenizedField);
text += String.format("commitAt: %s%n", commitAt);
text += String.format("indexFieldCounts: %s%n", indexFieldCounts);
return text;
}
