Skip to content

Commit

Permalink
NUTCH-3032 Code for an ArbitraryIndexingFilter to index values resolv…
Browse files Browse the repository at this point in the history
…ed by user POJO code at index time (#810)
  • Loading branch information
CatChullain authored Apr 4, 2024
1 parent 1563396 commit c9e2f4e
Show file tree
Hide file tree
Showing 11 changed files with 794 additions and 0 deletions.
4 changes: 4 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@
<packageset dir="${plugins.dir}/headings/src/java"/>
<packageset dir="${plugins.dir}/exchange-jexl/src/java"/>
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-arbitrary/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
Expand Down Expand Up @@ -646,6 +647,7 @@
<packageset dir="${plugins.dir}/headings/src/java"/>
<packageset dir="${plugins.dir}/exchange-jexl/src/java"/>
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-arbitrary/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
Expand Down Expand Up @@ -1173,6 +1175,8 @@
<source path="${plugins.dir}/exchange-jexl/src/java/" />
<source path="${plugins.dir}/index-anchor/src/java/" />
<source path="${plugins.dir}/index-anchor/src/test/" />
<source path="${plugins.dir}/index-arbitrary/src/java/" />
<source path="${plugins.dir}/index-arbitrary/src/test/" />
<source path="${plugins.dir}/index-basic/src/java/" />
<source path="${plugins.dir}/index-basic/src/test/" />
<source path="${plugins.dir}/index-geoip/src/java/" />
Expand Down
66 changes: 66 additions & 0 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2252,6 +2252,72 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>

<!-- index-arbitrary plugin properties -->
<property>
  <name>index.arbitrary.function.count</name>
  <value></value>
  <description>The count of arbitrary additions/edits to the document.
  Specify the remaining properties (fieldName, className, constructorArgs,
  methodName, methodArgs, and overwrite) independently in this file by
  appending a dot (.) followed by integer numerals (beginning with '0') to
  the property names, e.g.:

    index.arbitrary.fieldName.0
  for the field to add/set with the first arbitrary addition or:

    index.arbitrary.className.3
  for the POJO class name to use in setting the fourth arbitrary addition.
  </description>
</property>

<property>
  <name>index.arbitrary.fieldName.0</name>
  <value></value>
  <description>
    The name of the field to add to the document with the value returned
    from the custom POJO.
  </description>
</property>

<property>
  <name>index.arbitrary.className.0</name>
  <value></value>
  <description>
    The fully qualified name of the POJO class that will supply values for
    the new field.
  </description>
</property>

<property>
  <name>index.arbitrary.constructorArgs.0</name>
  <value></value>
  <description>
    The values (as strings) to pass into the POJO constructor. The POJO must
    accept a String representation of the NutchDocument's URL as the first
    parameter in the constructor. The values you specify here will populate
    the constructor arguments 1,..,n-1 where n=the count of arguments to the
    constructor. Argument #0 will be the NutchDocument's URL.
  </description>
</property>

<property>
  <name>index.arbitrary.methodName.0</name>
  <value></value>
  <description>
    The name of the method to invoke on the instance of your custom class in
    order to determine the value to add to the document.
  </description>
</property>

<property>
  <name>index.arbitrary.methodArgs.0</name>
  <value></value>
  <description>
    The values (as strings) to pass into the named method on the POJO
    instance. Unlike the constructor args, there is no required argument
    that this method in the POJO must accept, i.e., the Arbitrary Indexer
    doesn't supply any arguments taken from the NutchDocument values by
    default.
  </description>
</property>

<property>
  <name>index.arbitrary.overwrite.0</name>
  <value></value>
  <description>Whether to overwrite any existing value in the doc for
  fieldName. Default is false if not specified in config.</description>
</property>

<!-- parse-metatags plugin properties -->
<property>
<name>metatags.names</name>
Expand Down
3 changes: 3 additions & 0 deletions src/plugin/build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
<ant dir="headings" target="deploy"/>
<ant dir="exchange-jexl" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-arbitrary" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-geoip" target="deploy"/>
<ant dir="index-jexl-filter" target="deploy"/>
Expand Down Expand Up @@ -117,6 +118,7 @@
<ant dir="feed" target="test"/>
<ant dir="headings" target="test"/>
<ant dir="index-anchor" target="test"/>
<ant dir="index-arbitrary" target="test"/>
<ant dir="index-basic" target="test"/>
<!--ant dir="index-geoip" target="test"/-->
<ant dir="index-jexl-filter" target="test"/>
Expand Down Expand Up @@ -179,6 +181,7 @@
<ant dir="headings" target="clean"/>
<ant dir="exchange-jexl" target="clean"/>
<ant dir="index-anchor" target="clean"/>
<ant dir="index-arbitrary" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-geoip" target="clean"/>
<ant dir="index-jexl-filter" target="clean"/>
Expand Down
22 changes: 22 additions & 0 deletions src/plugin/index-arbitrary/build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="index-arbitrary" default="jar-core">

  <!--
    No targets are declared in this file; the build logic is pulled in from
    build-plugin.xml in the parent directory. The default target, jar-core,
    is presumably supplied by that imported file (confirm there).
  -->
  <import file="../build-plugin.xml"/>

</project>
39 changes: 39 additions & 0 deletions src/plugin/index-arbitrary/ivy.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<ivy-module version="1.0">

  <!-- Module identity; the module name is taken from the Ant project name. -->
  <info organisation="org.apache.nutch" module="${ant.project.name}">
    <license name="Apache 2.0"/>
    <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/>
    <description>
      Apache Nutch
    </description>
  </info>

  <!-- Configurations are shared: they are included from the top-level ivy directory. -->
  <configurations>
    <include file="../../../ivy/ivy-configurations.xml"/>
  </configurations>

  <publications>
    <!-- get the artifact from our module name -->
    <artifact conf="master"/>
  </publications>

  <!-- This plugin declares no dependencies of its own. -->
  <dependencies/>

</ivy-module>
42 changes: 42 additions & 0 deletions src/plugin/index-arbitrary/plugin.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<plugin id="index-arbitrary"
        name="Index Arbitrary"
        version="1.0.0"
        provider-name="nutch.org">

  <!-- The plugin's jar; every name in it is exported. -->
  <runtime>
    <library name="index-arbitrary.jar">
      <export name="*"/>
    </library>
  </runtime>

  <!-- Requires the nutch-extensionpoints plugin. -->
  <requires>
    <import plugin="nutch-extensionpoints"/>
  </requires>

  <!-- Registers ArbitraryIndexingFilter at the IndexingFilter extension point. -->
  <extension id="org.apache.nutch.indexer.arbitrary"
             name="Nutch arbitrary data indexer"
             point="org.apache.nutch.indexer.IndexingFilter">
    <implementation id="ArbitraryIndexingFilter"
                    class="org.apache.nutch.indexer.arbitrary.ArbitraryIndexingFilter"/>
  </extension>

</plugin>
Loading

0 comments on commit c9e2f4e

Please sign in to comment.