Skip to content

Commit

Permalink
stricter href collection (closes #157)
Browse files Browse the repository at this point in the history
  • Loading branch information
boogheta committed Feb 12, 2016
1 parent a761452 commit 6a1db5e
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion hyphe_backend/crawler/hcicrawler/linkextractor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# REWRITE OF THE REGEXP LINK EXTRACTOR FROM SCRAPY FOR BETTER PERFORMANCE
# WHAT WAS CHANGED:
# - linkre regexp (more generic)
Expand All @@ -14,7 +15,7 @@
from scrapy.link import Link
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

# NOTE(review): in this diff view the next assignment appears to be the line
# removed by the commit and the one after it the replacement -- confirm
# against the repository.
# Looser pattern: matches any `href=` occurrence, whatever precedes it, and
# captures the value in group 1 (double-quoted, single-quoted, or unquoted).
linkre = re.compile(r"href=(\"[^\">]+[\">]|'[^'>]+['>]|[^\s>]+[\s>])", re.DOTALL | re.IGNORECASE)
# Stricter pattern: `href` must follow `<a` with no `>` in between, and
# whitespace is tolerated on either side of `=`. Group 1 still includes the
# surrounding delimiters (opening quote, and the closing quote / whitespace /
# `>` that terminates the value), so callers have to strip them.
# Matching is case-insensitive; `<a[^>]*` has no word boundary, so it accepts
# any tag whose name merely starts with "a".
linkre = re.compile(r"<a[^>]*href\s*=\s*(\"[^\">]+[\">]|'[^'>]+['>]|[^\s>]+[\s>])", re.DOTALL | re.IGNORECASE)

def clean_link(link_text):
"""Remove leading and trailing whitespace and punctuation"""
Expand Down

0 comments on commit 6a1db5e

Please sign in to comment.