diff --git a/Src/NReadability/NReadability.Tests/NReadability.Tests.csproj b/Src/NReadability/NReadability.Tests/NReadability.Tests.csproj index 1ddfc2b..14e767b 100644 --- a/Src/NReadability/NReadability.Tests/NReadability.Tests.csproj +++ b/Src/NReadability/NReadability.Tests/NReadability.Tests.csproj @@ -1,4 +1,4 @@ - + Debug @@ -137,6 +137,12 @@ PreserveNewest + + PreserveNewest + + + PreserveNewest + PreserveNewest @@ -206,6 +212,9 @@ PreserveNewest + + PreserveNewest + diff --git a/Src/NReadability/NReadability.Tests/NReadabilityWebTranscoderTests.cs b/Src/NReadability/NReadability.Tests/NReadabilityWebTranscoderTests.cs index 400d962..b30b0f5 100644 --- a/Src/NReadability/NReadability.Tests/NReadabilityWebTranscoderTests.cs +++ b/Src/NReadability/NReadability.Tests/NReadabilityWebTranscoderTests.cs @@ -113,13 +113,22 @@ public class NReadabilityWebTranscoderTests @"http://www.sparknotes.com/lit/mocking/section2.rhtml", } }, + { + 10, + new[] + { + @"http://www.ilr.cornell.edu/trianglefire/story/introduction.html", + @"http://www.ilr.cornell.edu/trianglefire/story/sweatshopsStrikes.html", + @"http://www.ilr.cornell.edu/trianglefire/story/investigationTrial.html", + } + }, }; #endregion [Test] [Sequential] - public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9)]int sampleInputNumber) + public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)]int sampleInputNumber) { const string outputDir = "SampleWebOutput"; @@ -227,6 +236,18 @@ public void TestSampleInputs([Values(1, 2, 3, 4, 5, 6, 7, 8, 9)]int sampleInputN Assert.IsTrue(extractedContent.Contains("educational technique but the law.")); break; + case 10: + // page 1 + Assert.IsTrue(extractedContent.Contains("he fire at the Triangle Waist Company")); + Assert.IsTrue(extractedContent.Contains("at the hands of industrial greed.")); + // page 2 + Assert.IsTrue(extractedContent.Contains("he Triangle Waist Company was in many ways")); + Assert.IsTrue(extractedContent.Contains("unsafe working 
conditions on their employees.")); + // page 3 (last) + Assert.IsTrue(extractedContent.Contains("mmediately after the fire, Triangle owners Blanck and Harris")); + Assert.IsTrue(extractedContent.Contains("and that it was \"second to none in the country.\"")); + break; + default: throw new NotSupportedException("Unknown sample input number (" + sampleInputNumber + "). Have you added another sample input? If so, then add appropriate asserts here as well."); } diff --git a/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_1.html b/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_1.html new file mode 100644 index 0000000..06a3053 --- /dev/null +++ b/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_1.html @@ -0,0 +1,141 @@ + + + + + + + + + +Cornell University - ILR School - The Triangle Factory Fire - Introduction + + + + + + + + + + + +
+
+ + Remembering the Triangle Factory Fire, 100 years later + +
+ +
+ +
+
+
+ +

INTRODUCTION

+ +
+
+
+ +

The fire at the Triangle Waist Company in New York City, which claimed the lives of 146 young immigrant workers, is one of the worst disasters since the beginning of the Industrial Revolution.

+

This incident has had great significance to this day because it highlights the inhumane working conditions to which industrial workers can be subjected. To many, its horrors epitomize the extremes of industrialism.

+

The tragedy still dwells in the collective memory of the nation and of the international labor movement. The victims of the tragedy are still celebrated as martyrs at the hands of industrial greed.

+ +

Continue »

+ +
+
+
+
+ +
+ Water from fire hoses spraying the top floors of the Asch Building +
+ + +
+

RELATED RESOURCES

+
+ + + + +
+ +
+ +
+ +
+ + diff --git a/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_2.html b/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_2.html new file mode 100644 index 0000000..8fe2a40 --- /dev/null +++ b/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_2.html @@ -0,0 +1,161 @@ + + + + + + + + + +Cornell University - ILR School - The Triangle Factory Fire - Sweatshops and Strikes Before 1911 + + + + + + + + + + + +
+
+ + Remembering the Triangle Factory Fire, 100 years later + +
+ +
+ +
+
+
+ +

SWEATSHOPS & STRIKES BEFORE 1911

+ +
+
+
+ +

The Triangle Waist Company was in many ways a typical sweated factory in the heart of Manhattan, at 23-29 Washington Place, at the northern corner of Washington Square East. Low wages, excessively long hours, and unsanitary and dangerous working conditions were the hallmarks of sweatshops.

+

Even though many workers toiled under one roof in the Asch building, owned by Max Blanck and Isaac Harris, the owners subcontracted much work to individuals who hired the hands and pocketed a portion of the profits. Subcontractors could pay the workers whatever rates they wanted, often extremely low. The owners supposedly never knew the rates paid to the workers, nor did they know exactly how many workers were employed at their factory at any given point. Such a system led to exploitation.

+

Even today, sweatshops have not disappeared in the United States. They keep attracting workers in desperate need of employment and undocumented immigrants, who may be anxious to avoid involvement with governmental agencies. Recent studies conducted by the U.S. Department of Labor found that 67% of Los Angeles garment factories and 63% of New York garment factories violate minimum wage and overtime laws. Ninety-eight percent of Los Angeles garment factories have workplace health and safety problems serious enough to lead to severe injuries or death.

+

The International Ladies' Garment Workers Union organized workers in the women's clothing trade. Many of the garment workers before 1911 were unorganized, partly because they were young immigrant women intimidated by the alien surroundings. Others were more daring, though. All were ripe for action against the poor working conditions. In 1909, an incident at the Triangle Factory sparked a spontaneous walkout of its 400 employees. The Women's Trade Union League, a progressive association of middle class white women, helped the young women workers picket and fend off thugs and police provocation. At a historic meeting at Cooper Union, thousands of garment workers from all over the city followed young Clara Lemlich's call for a general strike.

+

With the cloakmakers' strike of 1910, a historic agreement was reached that established a grievance system in the garment industry. Unfortunately for the workers, though, many shops were still in the hands of unscrupulous owners, who disregarded basic workers' rights and imposed unsafe working conditions on their employees.

+

Continue »

+ +
+
+
+
+ +
+ Sweatshop conditions in the early 1900s
+ + +
+

RELATED RESOURCES

+
+ + + + +
+ +
+ +
+ +
+ + diff --git a/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_3.html b/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_3.html new file mode 100644 index 0000000..0d62c4e --- /dev/null +++ b/Src/NReadability/NReadability.Tests/SampleWebInput/SampleInput_10_3.html @@ -0,0 +1,147 @@ + + + + + + + + + +Cornell University - ILR School - The Triangle Factory Fire - Investigation and Trial + + + + + + + + + + + +
+
+ + Remembering the Triangle Factory Fire, 100 years later + +
+ +
+ +
+
+
+ +

INVESTIGATION & TRIAL

+ +
+
+
+ +

Immediately after the fire, Triangle owners Blanck and Harris declared in interviews that their building was fireproof, and that it had just been approved by the Department of Buildings. Yet the call for bringing those responsible to justice and reports that the doors of the factory were locked at the time of the fire prompted the District Attorney's office to seek an indictment against the owners. On April 11, a grand jury indicted Harris and Blanck on seven counts, charging them with manslaughter in the second degree under section 80 of the Labor Code, which mandated that doors should not be locked during working hours.

+

Justice?

+

On December 27, twenty-three days after the trial had started, a jury acquitted Blanck and Harris of any wrongdoing. The task of the jurors had been to determine whether the owners knew that the doors were locked at the time of the fire.

+

Customarily, the only way out for workers at quitting time was through an opening on the Greene Street side, where all pocketbooks were inspected to prevent stealing. Worker after worker testified to their inability to open the doors to their only viable escape route, the stairs to the Washington Place exit, because the Greene Street side stairs were completely engulfed by fire. More testimony supported this fact. Yet the brilliant defense attorney Max Steuer planted enough doubt in the jurors' minds to win a not-guilty verdict. Grieving families and much of the public felt that justice had not been done. "Justice!" they cried. "Where is justice?"

+

Twenty-three individual civil suits were brought against the owners of the Asch building. On March 11, 1914, three years after the fire, Harris and Blanck settled. They paid 75 dollars per life lost.

+

Harris and Blanck were to continue their defiant attitude toward the authorities. Just a few days after the fire, the new premises of their factory had been found not to be fireproof, without fire escapes, and without adequate exits.

+

In August of 1913, Max Blanck was charged with locking one of the doors of his factory during working hours. Brought to court, he was fined twenty dollars, and the judge apologized to him for the imposition.

+

In December of 1913, the interior of his factory was found to be littered with rubbish piled six feet high, with scraps kept in non-regulation, flammable wicker baskets. This time, instead of a court appearance and a fine, he was served a stern warning. The Triangle Waist Company was to cease operations in 1918, but the owners maintained throughout that their factory was a "model of cleanliness and sanitary conditions," and that it was "second to none in the country."

+ +
+
+
+
+ +
+ Door handle found on the ninth floor with the bolt in the locked position +
+ + +
+

RELATED RESOURCES

+
+ + + + +
+ +
+ +
+ +
+ + diff --git a/Src/NReadability/NReadability/NReadabilityTranscoder.cs b/Src/NReadability/NReadability/NReadabilityTranscoder.cs index 7b67a8d..482fd96 100644 --- a/Src/NReadability/NReadability/NReadabilityTranscoder.cs +++ b/Src/NReadability/NReadability/NReadabilityTranscoder.cs @@ -118,8 +118,9 @@ private class LinkData private static readonly Regex _ArticleTitleDashRegex2 = new Regex("(.*)[\\|\\-] .*", RegexOptions.Compiled); private static readonly Regex _ArticleTitleDashRegex3 = new Regex("[^\\|\\-]*[\\|\\-](.*)", RegexOptions.Compiled); private static readonly Regex _ArticleTitleColonRegex1 = new Regex(".*:(.*)", RegexOptions.Compiled); - private static readonly Regex _ArticleTitleColonRegex2 = new Regex("[^:]*[:](.*)", RegexOptions.Compiled); - private static readonly Regex _NextLink = new Regex(@"(next|weiter|continue|dalej|następna|nastepna>([^\|]|$)|�([^\|]|$))", RegexOptions.Compiled | RegexOptions.IgnoreCase); + private static readonly Regex _ArticleTitleColonRegex2 = new Regex("[^:]*[:](.*)", RegexOptions.Compiled); + private static readonly Regex _ContinueLink = new Regex(@"(continue|continuer|continuar)", RegexOptions.Compiled | RegexOptions.IgnoreCase); + private static readonly Regex _NextLink = new Regex(@"(next|weiter|dalej|następna|nastepna>([^\|]|$)|�([^\|]|$))", RegexOptions.Compiled | RegexOptions.IgnoreCase); private static readonly Regex _NextStoryLink = new Regex("(story|article|news|document|post|note|series|historia|artykul|artykuł|wpis|dokument|seria|geschichte|erzählung|erzahlung|artikel|serie)", RegexOptions.Compiled | RegexOptions.IgnoreCase); private static readonly Regex _PrevLink = new Regex("(prev|earl|[^b]old|new|wstecz|poprzednia|<|�)", RegexOptions.Compiled | RegexOptions.IgnoreCase); private static readonly Regex _PageRegex = new Regex("pag(e|ing|inat)|([^a-z]|^)pag([^a-z]|$)", RegexOptions.Compiled | RegexOptions.IgnoreCase); @@ -442,14 +443,6 @@ internal string FindNextPageLink(XElement body, string url) continue; } - 
/* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */ - string linkHrefLeftover = linkHref.Replace(articleBaseUrl, ""); - - if (!Regex.IsMatch(linkHrefLeftover, @"\d")) - { - continue; - } - if (!possiblePagesByLink.Keys.Contains(linkHref)) { possiblePagesByLink[linkHref] = new LinkData { Score = 0, LinkHref = linkHref, LinkText = linkText }; @@ -461,6 +454,12 @@ internal string FindNextPageLink(XElement body, string url) LinkData linkObj = possiblePagesByLink[linkHref]; + /* If the leftovers of the URL after removing the base URL don't contain any digits, it could still be the link, but the odds are lower. */ + if (!Regex.IsMatch(linkHref.Replace(articleBaseUrl, ""), @"\d")) + { + linkObj.Score -= 50; + } + /* * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html @@ -477,6 +476,11 @@ internal string FindNextPageLink(XElement body, string url) { linkObj.Score += 50; } + else if (_ContinueLink.IsMatch(linkData)) + { + /* Give "continue" links high weight since that usually implies a context of 'this article/story' */ + linkObj.Score += 100; + } if (_PageRegex.IsMatch(linkData)) {