From 9be1886fa418ca2628fa2b3c02a9cba08ac51824 Mon Sep 17 00:00:00 2001 From: Will Coster Date: Sat, 28 May 2016 17:18:06 -0700 Subject: [PATCH] (#23, #25, #34) Document chroot tricks This commit adds the resolutions to #23 and #25 to README.md under a new tips & tricks section. Full source code for each is given in a new example as well. Additionally a backtracking bug was found in `chroot` while documenting these tricks and is fixed in this commit as well. --- CHANGELOG.md | 2 + README.md | 97 ++++++++++++++++++++++++ examples/complex-predicates/Main.hs | 44 +++++++++++ examples/generalized-repetition/Main.hs | 42 ++++++++++ examples/scalpel-examples.cabal | 16 ++++ src/Text/HTML/Scalpel/Internal/Scrape.hs | 7 +- tests/TestMain.hs | 10 +++ 7 files changed, 215 insertions(+), 3 deletions(-) create mode 100644 examples/complex-predicates/Main.hs create mode 100644 examples/generalized-repetition/Main.hs diff --git a/CHANGELOG.md b/CHANGELOG.md index 55b5282..5450a05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## HEAD +- Added resolution to #23 and #25 to README.md and added examples. + ## 0.3.1 - Added the `innerHTML` and `innerHTMLs` scraper. diff --git a/README.md b/README.md index 1bcfc5b..1313d86 100644 --- a/README.md +++ b/README.md @@ -109,3 +109,100 @@ allComments = scrapeURL "http://example.com/article.html" comments imageURL <- attr "src" $ "img" @: [hasClass "image"] return $ ImageComment author imageURL ``` + +Tips & Tricks +------------- + +The primitives provided by scalpel are intentionally minimalistic with the +assumption being that users will be able to build up complex functionality by +combining them with functions that work on existing type classes (Monad, +Applicative, Alternative, etc.). + +This section gives examples of common tricks for building up more complex +behavior from the simple primitives provided by this library. 
+ +### Complex Predicates + +It is possible to run into scenarios where the name and attributes of a tag are +not sufficient to isolate interesting tags and properties of child tags need to +be considered. + +In these cases the `guard` function of the `Alternative` type class can be +combined with `chroot` and `Any` to implement predicates of arbitrary +complexity. + +Building off the above example, consider a use case where we would like to find the +html contents of a comment that mentions the word "cat". + +The strategy will be the following: + +1. Isolate the comment div using `chroot`. + +2. Then within the context of that div the textual contents can be retrieved + with `text Any`. This works because the first tag within the current context + is the div tag selected by chroot, and the `Any` selector will match the + first tag within the current context. + +3. Then the predicate that `"cat"` appear in the text of the comment will be + enforced using `guard`. If the predicate fails, scalpel will backtrack and + continue the search for divs until one is found that matches the predicate. + +4. Return the desired HTML content of the comment div. + +```haskell +catComment :: Scraper String String +catComment = + -- 1. First narrow the current context to the div containing the comment's + -- textual content. + chroot ("div" @: [hasClass "comment", hasClass "text"]) $ do + -- 2. Any can be used to access the root tag of the current context. + contents <- text Any + -- 3. Skip comment divs that do not contain "cat". + guard ("cat" `isInfixOf` contents) + -- 4. Generate the desired value. + html Any +``` + +For the full source of this example, see +[complex-predicates](https://github.com/fimad/scalpel/tree/master/examples/complex-predicates/) +in the examples directory. + +### Generalized Repetition + +The pluralized versions of the primitive scrapers (`texts`, `attrs`, `htmls`) +allow the user to extract content from all of the tags matching a given +selector. 
For more complex scraping tasks it will at times be desirable to be +able to extract multiple values from the same tag. + +Like the previous example, the trick here is to use a combination of the +`chroots` function and the `Any` selector. + +Consider an extension to the original example where image comments may contain +some alt text and the desire is to return a tuple of the alt text and the URLs +of the images. + +The strategy will be the following: + +1. Isolate each img tag using `chroots`. + +2. Then within the context of each img tag, use the `Any` selector to extract + the alt and src attributes from the current tag. + +3. Create and return a tuple of the extracted attributes. + +```haskell +altTextAndImages :: Scraper String [(String, URL)] +altTextAndImages = + -- 1. First narrow the current context to each img tag. + chroots "img" $ do + -- 2. Use Any to access all the relevant content from the currently + -- selected img tag. + altText <- attr "alt" Any + srcUrl <- attr "src" Any + -- 3. Combine the retrieved content into the desired final result. + return (altText, srcUrl) +``` + +For the full source of this example, see +[generalized-repetition](https://github.com/fimad/scalpel/tree/master/examples/generalized-repetition/) +in the examples directory. diff --git a/examples/complex-predicates/Main.hs b/examples/complex-predicates/Main.hs new file mode 100644 index 0000000..0628ae4 --- /dev/null +++ b/examples/complex-predicates/Main.hs @@ -0,0 +1,44 @@ +import Text.HTML.Scalpel +import Control.Applicative +import Control.Monad +import Data.List (isInfixOf) + + +exampleHtml :: String +exampleHtml = "\ +\ \ +\
\ +\
\ +\ Sally\ +\
Woo hoo!
\ +\
\ +\
\ +\ Bill\ +\ \ +\
\ +\
\ +\ Bertrand\ +\
That sure is some cat!
\ +\
\ +\
\ +\ Susan\ +\
WTF!?!
\ +\
\ +\
\ +\ \ +\" + +main :: IO () +main = print $ scrapeStringLike exampleHtml catComment + +catComment :: Scraper String String +catComment = + -- 1. First narrow the current context to the div containing the comment's + -- textual content. + chroot ("div" @: [hasClass "comment", hasClass "text"]) $ do + -- 2. Any can be used to access the root tag of the current context. + contents <- text Any + -- 3. Skip comment divs that do not contain "cat". + guard ("cat" `isInfixOf` contents) + -- 4. Generate the desired value. + html Any diff --git a/examples/generalized-repetition/Main.hs b/examples/generalized-repetition/Main.hs new file mode 100644 index 0000000..68a63fa --- /dev/null +++ b/examples/generalized-repetition/Main.hs @@ -0,0 +1,42 @@ +import Text.HTML.Scalpel + + +exampleHtml :: String +exampleHtml = "\ +\ \ +\
\ +\
\ +\ Sally\ +\
Woo hoo!
\ +\
\ +\
\ +\ Bill\ +\ A cat picture.\ +\
\ +\
\ +\ Susan\ +\
WTF!?!
\ +\
\ +\
\ +\ Bill\ +\ A dog picture.\ +\
\ +\
\ +\ \ +\" + +main :: IO () +main = print $ scrapeStringLike exampleHtml altTextAndImages + +altTextAndImages :: Scraper String [(String, URL)] +altTextAndImages = + -- 1. First narrow the current context to each img tag. + chroots "img" $ do + -- 2. Use Any to access all the relevant content from the the currently + -- selected img tag. + altText <- attr "alt" Any + srcUrl <- attr "src" Any + -- 3. Combine the retrieved content into the desired final result. + return (altText, srcUrl) diff --git a/examples/scalpel-examples.cabal b/examples/scalpel-examples.cabal index b1d7833..fdb7297 100644 --- a/examples/scalpel-examples.cabal +++ b/examples/scalpel-examples.cabal @@ -11,6 +11,14 @@ category: Web build-type: Simple cabal-version: >=1.10 +executable complex-predicates + default-language: Haskell2010 + main-is: complex-predicates/Main.hs + build-depends: + base >= 4.6 && < 5 + , scalpel >= 0.2.0 + ghc-options: -W + executable example-from-documentation default-language: Haskell2010 main-is: example-from-documentation/Main.hs @@ -19,6 +27,14 @@ executable example-from-documentation , scalpel >= 0.2.0 ghc-options: -W +executable generalized-repetition + default-language: Haskell2010 + main-is: generalized-repetition/Main.hs + build-depends: + base >= 4.6 && < 5 + , scalpel >= 0.2.0 + ghc-options: -W + executable list-all-images default-language: Haskell2010 main-is: list-all-images/Main.hs diff --git a/src/Text/HTML/Scalpel/Internal/Scrape.hs b/src/Text/HTML/Scalpel/Internal/Scrape.hs index 61945b6..c9e1285 100644 --- a/src/Text/HTML/Scalpel/Internal/Scrape.hs +++ b/src/Text/HTML/Scalpel/Internal/Scrape.hs @@ -71,9 +71,10 @@ scrape s = scrapeOffsets s . tagWithOffset . TagSoup.canonicalizeTags -- match every set of tags, use 'chroots'. chroot :: (Ord str, TagSoup.StringLike str, Selectable s) => s -> Scraper str a -> Scraper str a -chroot selector (MkScraper inner) = MkScraper - $ join . (inner <$>) - . listToMaybe . 
select selector +chroot selector inner = do + maybeResult <- listToMaybe <$> chroots selector inner + guard (isJust maybeResult) + return $ fromJust maybeResult -- | The 'chroots' function takes a selector and an inner scraper and executes -- the inner scraper as if it were scraping a document that consists solely of diff --git a/tests/TestMain.hs b/tests/TestMain.hs index adadc62..b3addb2 100644 --- a/tests/TestMain.hs +++ b/tests/TestMain.hs @@ -4,6 +4,8 @@ module Main (main) where import Text.HTML.Scalpel import Control.Applicative +import Control.Monad (guard) +import Data.List (isInfixOf) import System.Exit import Test.HUnit @@ -224,6 +226,14 @@ scrapeTests = "scrapeTests" ~: TestList [ "foobar" (Just ["foo","bar"]) (innerHTMLs "a") + + , scrapeTest + "foobarbaz" + (Just "bar") + (chroot "a" $ do + t <- text Any + guard ("b" `isInfixOf` t) + html Any) ] scrapeTest :: (Eq a, Show a) => String -> Maybe a -> Scraper String a -> Test