From 0805dde1d53c38b279244dcbb551c446458ad4be Mon Sep 17 00:00:00 2001 From: Marcel Otto Date: Tue, 7 May 2024 14:09:36 +0200 Subject: [PATCH] Add :pn_local_validation option and fix pn_local regex --- CHANGELOG.md | 5 +++ lib/rdf/serializations/turtle_trig/encoder.ex | 35 +++++++++++++++---- .../turtle_trig/encoder/state.ex | 4 ++- .../turtle_trig_encoder_test.exs | 6 +++- 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed81756a..a0cc650d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ Elixir v1.15 or later, where this issue has been resolved. - Capability to add custom content on the Turtle/TriG encoders with the `:content` option. - Option `:line_prefix` on Turtle/TriG encoder for a function defining custom line prefixes. - Option `:indent_width` on Turtle/TriG encoder to customize the indentation width. +- Option `:pn_local_validation` on Turtle/TriG encoder for controlling IRI validation + when encoding as a prefixed name. - `RDF.Dataset.update_all_graphs/2` to apply a function on all graphs of a dataset. - `RDF.Dataset.named_graphs/1` to get a list of all named graphs of a dataset. - `RDF.Dataset.graph_names/1` to get a list of all graph names of a dataset. @@ -50,6 +52,9 @@ Elixir v1.15 or later, where this issue has been resolved. - The `RDF.Turtle.Encoder` was not falling back to using `RDF.default_prefixes/0` when the encoded graph had prefixes which were removed afterwards. +- Fixed the `RDF.Turtle.Encoder` validation to ensure IRIs with permissible characters, + such as hyphens, can be correctly encoded as prefixed names. Previously, the validation + was overly strict, preventing some valid IRIs from being encoded as prefixed names. - `RDF.NTriples.Encoder` and `RDF.NQuads.Encoder` could not stream quoted RDF-star triples could as iodata. 
diff --git a/lib/rdf/serializations/turtle_trig/encoder.ex b/lib/rdf/serializations/turtle_trig/encoder.ex index f94a7a43..1677ffd5 100644 --- a/lib/rdf/serializations/turtle_trig/encoder.ex +++ b/lib/rdf/serializations/turtle_trig/encoder.ex @@ -59,6 +59,17 @@ defmodule RDF.TurtleTriG.Encoder do - `:indent`: Allows to specify the number of spaces the output should be indented. - `:indent_width`: Allows to specify the number of spaces that should be used for indentations (default: 4). + - `:pn_local_validation`: This option controls how IRIs are validated to check + whether they can be encoded as a prefixed name. Available settings are: + - `:fast` (default): Provides a quick and efficient validation that covers most + common cases. It does not handle every possible valid scenario, focusing instead + on typical structures encountered in prefixed names. + - `:none`: Disables validation entirely. Use this mode if you are confident that + all your IRIs are already compliant with prefixed name requirements, allowing you + to bypass validation checks for increased performance. + - Note: Currently, a `:strict` mode, which would provide comprehensive validation + conforming strictly to the Turtle specification, is not implemented. + Contributions for implementing this mode are welcome. 
""" end @@ -359,7 +370,7 @@ defmodule RDF.TurtleTriG.Encoder do defp term(%IRI{} = iri, state, _) do based_name(iri, state.base) || - prefixed_name(iri, state.prefixes) || + prefixed_name(iri, state.prefixes, state.pn_local_validation) || ["<", to_string(iri), ">"] end @@ -444,16 +455,28 @@ defmodule RDF.TurtleTriG.Encoder do ] end - def prefixed_name(iri, prefixes) do + def prefixed_name(iri, prefixes, pn_local_validation \\ :fast) do case PrefixMap.prefix_name_pair(prefixes, iri) do - {prefix, name} -> if valid_pn_local?(name), do: [prefix, ":", name] + {prefix, name} -> if valid_pn_local?(name, pn_local_validation), do: [prefix, ":", name] _ -> nil end end - defp valid_pn_local?(name) do - String.match?(name, ~r/^([[:alpha:]]|[[:digit:]]|_|:)*$/u) - end + @doc """ + Checks whether `pn_local` may be used as the local part (`PN_LOCAL`) of a Turtle prefixed name under the given `pn_local_validation` mode: `:none` accepts everything, `:fast` (default) applies an approximating regex which rejects a trailing `.` and an unescaped `~`; the empty string is always valid. + """ + def valid_pn_local?(pn_local, pn_local_validation \\ :fast) + + def valid_pn_local?(_pn_local, :none), do: true + def valid_pn_local?("", _), do: true + + @fast_pn_local_regex ~r/ + ^[\p{L}\p{Nd}_:%] + (?:[\p{L}\p{Nd}\p{M}\p{Pc}_.:\-%]* + [\p{L}\p{Nd}\p{M}\p{Pc}_:\-%])?$ + /ux + + def valid_pn_local?(pn_local, :fast), do: String.match?(pn_local, @fast_pn_local_regex) defp quoted(string) do if String.contains?(string, ["\n", "\r"]) do diff --git a/lib/rdf/serializations/turtle_trig/encoder/state.ex b/lib/rdf/serializations/turtle_trig/encoder/state.ex index 861ff16c..0c117e43 100644 --- a/lib/rdf/serializations/turtle_trig/encoder/state.ex +++ b/lib/rdf/serializations/turtle_trig/encoder/state.ex @@ -6,8 +6,9 @@ defmodule RDF.TurtleTriG.Encoder.State do :data, :graph, :base, - :prefixes, :implicit_base, + :prefixes, + :pn_local_validation, :single_triple_lines, :line_prefix, :base_indent, @@ -43,6 +44,7 @@ defmodule RDF.TurtleTriG.Encoder.State do base: base, implicit_base: Keyword.get(opts, :implicit_base), prefixes: prefixes, + pn_local_validation: Keyword.get(opts, :pn_local_validation, :fast), 
base_indent: Keyword.get(opts, :indent), indent_step: opts |> Keyword.get(:indent_width, @default_indent_width) |> indent_string(), line_prefix: line_prefix, diff --git a/test/unit/serializations/turtle_trig_encoder_test.exs b/test/unit/serializations/turtle_trig_encoder_test.exs index b0d0f241..35a4f6d3 100644 --- a/test/unit/serializations/turtle_trig_encoder_test.exs +++ b/test/unit/serializations/turtle_trig_encoder_test.exs @@ -13,7 +13,7 @@ defmodule RDF.TurtleTriG.EncoderTest do defvocab EX, base_iri: "http://example.org/#", terms: [], strict: false - describe "prefixed_name/2" do + describe "prefixed_name/3 (fast)" do setup do {:ok, prefixes: @@ -26,6 +26,10 @@ defmodule RDF.TurtleTriG.EncoderTest do test "hash iri with existing prefix", %{prefixes: prefixes} do assert TurtleTriG.Encoder.prefixed_name(EX.foo(), prefixes) |> IO.iodata_to_binary() == "ex:foo" + + assert TurtleTriG.Encoder.prefixed_name(~I<http://example.org/#foo-bar>, prefixes) + |> IO.iodata_to_binary() == + "ex:foo-bar" end test "hash iri namespace without name", %{prefixes: prefixes} do