garlopy

A Scrapely clone: it learns to scrape HTML web pages from examples, using basic machine learning and BeautifulSoup.

Example

from garlopy import GarlopyScraper
import unittest


class TestGarlopy(unittest.TestCase):
    """Train-then-scrape tests for ``GarlopyScraper``.

    Each test trains the scraper on one example page together with the data
    it contains, then checks what the scraper extracts from a second page.
    """

    # Scraped fields whose values are whitespace-stripped before they are
    # compared with the expected data.
    _TEXT_FIELDS = ('name', 'venue_name', 'date')

    def setUp(self):
        """Create a fresh scraper for every test."""
        self.s = GarlopyScraper()

    @staticmethod
    def _strip_all(values):
        """Return ``values`` with every string stripped.

        ``None`` is passed through unchanged, and an empty list stays an
        empty list (it is not collapsed to ``None``).
        """
        if values is None:
            return None
        return [value.strip() for value in values]

    def _test_scraper(self, html1, data1, html2, data2):
        """Train on ``(html1, data1)`` and check the scrape of ``html2``.

        Args:
            html1: HTML of the training page.
            data1: Data handed to ``train_html`` for ``html1`` (in
                ``test_basic`` a dict mapping a field name to a string).
            html2: HTML of the page to scrape after training.
            data2: Expected scrape result for ``html2`` (in ``test_basic``
                a dict mapping a field name to a list of strings).

        The scraped values of the fields in ``_TEXT_FIELDS`` are stripped of
        surrounding whitespace before the comparison. The expected and
        scraped data are printed before and after that normalisation to
        help diagnose failures.
        """
        scraper = self.s
        scraper.train_html(html1, data1)
        scraped = scraper.scrape_html(html2)

        # Single-argument print(...) calls produce the same output under
        # Python 2 and Python 3.
        print(data2)
        print(scraped)
        print('+' * 80)

        for field in self._TEXT_FIELDS:
            if field in scraped:
                scraped[field] = self._strip_all(scraped[field])

        print('*' * 80)
        print(data2)
        print('-' * 80)
        print(scraped)
        print('*' * 80)

        self.assertEqual(data2, scraped)

    def test_basic(self):
        """Train on a page with one ``<p>`` name and scrape a second such page."""
        html1 = '''
        <html>
        <p>Hector</p>
        </html>
        '''
        html2 = '''
        <html>
        <p>Jorge</p>
        </html>
        '''
        data1 = {'name': 'Hector'}
        data2 = {'name': ['Jorge']}

        self._test_scraper(html1, data1, html2, data2)

Name		Name	Last commit message	Last commit date
Latest commit History 6 Commits
BeautifulSoup.py		BeautifulSoup.py
LICENSE.md		LICENSE.md
README.md		README.md
__init__.py		__init__.py

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

garlopy

Example

About

Releases

Packages

Languages

License

jio-gl/garlopy

Folders and files

Latest commit

History

Repository files navigation

garlopy

Example

About

Topics

Resources

License

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages