From 5b98b8f7f5ab274cff19aaadb5c2d31eac60bb2c Mon Sep 17 00:00:00 2001
From: sebclick
Date: Wed, 8 Aug 2012 21:51:41 +0200
Subject: [PATCH 1/6] Fix Issue #3

Error in the initialisation of the array.
---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 4b3eb97..c281107 100755
--- a/main.py
+++ b/main.py
@@ -139,7 +139,7 @@ def exclude_url(exclude, link):
 		if response.getcode() in responseCode:
 			responseCode[response.getcode()]+=1
 		else:
-			responseCode[response.getcode()] = 0
+			responseCode[response.getcode()] = 1
 		if response.getcode()==200:
 			msg = response.read()
 		else:

From 7d8a610bafdaa1b631262ab3dfa837a6170ef081 Mon Sep 17 00:00:00 2001
From: sebclick
Date: Wed, 8 Aug 2012 22:21:27 +0200
Subject: [PATCH 2/6] Better handling of URLs that do not return 200 OK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

URLs that do not return 200 OK are no longer listed in the sitemap.
URLs that do not return 200 OK are only checked once (because they are
added to the crawled set).
---
 main.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index c281107..1a07693 100755
--- a/main.py
+++ b/main.py
@@ -133,6 +133,7 @@ def exclude_url(exclude, link):
 	crawling = tocrawl.pop()
 	url = urlparse(crawling)
+	crawled.add(crawling)
 	try:
 		request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
 		response = urlopen(request)
@@ -143,7 +144,8 @@ def exclude_url(exclude, link):
 		if response.getcode()==200:
 			msg = response.read()
 		else:
-			msg = ""
+			response.close()
+			continue
 
 		response.close()
 	except Exception as e:
@@ -151,9 +153,9 @@ def exclude_url(exclude, link):
 			logging.debug ("{1} ==> {0}".format(e, crawling))
 		continue
 
-
+	print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
+	output_file.flush()
 	links = linkregex.findall(msg)
-	crawled.add(crawling)
 	for link in links:
 		link = link.decode("utf-8")
 		if link.startswith('/'):
@@ -173,7 +175,6 @@ def exclude_url(exclude, link):
 		target_extension = os.path.splitext(parsed_link.path)[1][1:]
 
 		if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link,arg.debug) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
-			print ("<url><loc>"+link+"</loc></url>", file=output_file)
 			tocrawl.add(link)
 
 print (footer, file=output_file)
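Taken together, the first two patches change the fetch loop in two ways: a status code seen for the first time now starts its counter at 1 instead of 0, and a URL that does not return 200 OK is marked as crawled but never parsed or written to the sitemap. A minimal standalone sketch of that behaviour (the responseCode/crawled names mirror main.py; the Python 3 urllib.request import and the example URL are assumptions, not part of the patches):

    from urllib.request import Request, urlopen

    responseCode = {}
    crawled = set()

    crawling = "http://blog.lesite.us/"      # hypothetical start URL
    crawled.add(crawling)                    # checked once, even if it fails
    try:
        request = Request(crawling, headers={"User-Agent": "Sitemap crawler"})
        response = urlopen(request)
        code = response.getcode()
        # PATCH 1/6: the first occurrence of a code is counted as 1, not 0
        responseCode[code] = responseCode.get(code, 0) + 1
        if code == 200:
            msg = response.read()            # only 200 responses are parsed for links
        # PATCH 2/6: anything else is simply skipped, so it never reaches the sitemap
        response.close()
    except Exception as e:
        print("{1} ==> {0}".format(e, crawling))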
From ac8f453d92fb131aa31da95e3204ab3e5d70982b Mon Sep 17 00:00:00 2001
From: sebclick
Date: Sat, 11 Aug 2012 15:18:54 +0200
Subject: [PATCH 3/6] Add "drop" parameter and add logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The drop parameter removes an attribute from a URL using a regular
expression.
Logging was added to report the number of HTTP errors per status code
and the number of URLs blocked by the robots.txt file.
---
 main.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 60 insertions(+), 13 deletions(-)

diff --git a/main.py b/main.py
index 1a07693..c067395 100755
--- a/main.py
+++ b/main.py
@@ -47,23 +47,24 @@ def exclude_url(exclude, link):
 parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
 parser.add_argument('--output', action="store", default=None, help="Output file")
 parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
+parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
 
 group = parser.add_mutually_exclusive_group()
 group.add_argument('--config', action="store", default=None, help="Configuration file in json format")
 group.add_argument('--domain', action="store", default="", help="Target domain (ex: http://blog.lesite.us)")
 
 arg = parser.parse_args()
-
 # Read the config file if needed
 if arg.config is not None:
 	try:
 		config_data=open(arg.config,'r')
 		config = json.load(config_data)
 		config_data.close()
-	except:
+	except Exception as e:
 		if arg.debug:
 			logging.debug ("Bad or unavailable config file")
 		config = {}
+		print(e)
 else:
 	config = {}
@@ -128,33 +129,52 @@ def exclude_url(exclude, link):
 	rp.read()
 
 responseCode={}
+nbUrl=0
+nbRp=0
 print (header, file=output_file)
 while tocrawl:
 	crawling = tocrawl.pop()
+
 	url = urlparse(crawling)
 	crawled.add(crawling)
+
 	try:
 		request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
+		# TODO : The urlopen() function has been removed in Python 3 in favor of urllib2.urlopen()
 		response = urlopen(request)
+	except Exception as e:
+		if hasattr(e,'code'):
+			if e.code in responseCode:
+				responseCode[e.code]+=1
+			else:
+				responseCode[e.code]=1
+		#else:
+		#	responseCode['erreur']+=1
+		if arg.debug:
+			logging.debug ("{1} ==> {0}".format(e, crawling))
+		response.close()
+		continue
+
+	# Read the response
+	try:
+		msg = response.read()
 		if response.getcode() in responseCode:
 			responseCode[response.getcode()]+=1
 		else:
-			responseCode[response.getcode()] = 1
-		if response.getcode()==200:
-			msg = response.read()
-		else:
-			response.close()
-			continue
-
+			responseCode[response.getcode()]=1
 		response.close()
 	except Exception as e:
 		if arg.debug:
-			logging.debug ("{1} ==> {0}".format(e, crawling))
+			logging.debug ("{1} ===> {0}".format(e, crawling))
 		continue
+
 	print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
-	output_file.flush()
+	if output_file:
+		output_file.flush()
+
+	# Found links
 	links = linkregex.findall(msg)
 	for link in links:
 		link = link.decode("utf-8")
@@ -169,17 +189,44 @@ def exclude_url(exclude, link):
 		if "#" in link:
 			link = link[:link.index('#')]
 
+		# Drop attributes if needed
+		if arg.drop is not None:
+			for toDrop in arg.drop:
+				link=re.sub(toDrop,'',link)
+
 		# Parse the url to get domain and file extension
 		parsed_link = urlparse(link)
 		domain_link = parsed_link.netloc
 		target_extension = os.path.splitext(parsed_link.path)[1][1:]
 
-		if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link,arg.debug) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
-			tocrawl.add(link)
+		if (link in crawled):
+			continue
+		if (link in tocrawl):
+			continue
+		if (domain_link != target_domain):
+			continue
+
+		# Count one more URL
+		nbUrl+=1
+
+		if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
+			nbRp+=1
+			continue
+		if ("javascript" in link):
+			continue
+		if (target_extension in arg.skipext):
+			continue
+		if (exclude_url(arg.exclude, link)==False):
+			continue
+
+		tocrawl.add(link)
 
 print (footer, file=output_file)
 if arg.debug:
+	logging.debug ("Number of found URL : {0}".format(nbUrl))
 	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
+	if arg.parserobots:
+		logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))
 	for code in responseCode:
 		logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))
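The net effect of this patch on each discovered link is a small pipeline: strip whatever the --drop expressions match, then run a chain of early continue checks, counting every candidate URL (nbUrl) and every robots.txt rejection (nbRp). A rough standalone sketch of that flow, with invented stand-ins for arg.drop, arg.skipext and the crawler state, and with the robots.txt and --exclude checks left out:

    import os
    import re
    from urllib.parse import urlparse

    drop = [r"id=[0-9]{5}"]                  # hypothetical --drop patterns
    skipext = ["pdf"]                        # hypothetical --skipext values
    target_domain = "blog.lesite.us"
    crawled, tocrawl = set(), set()
    nbUrl = 0

    candidates = [
        "http://blog.lesite.us/article?id=12345",
        "http://blog.lesite.us/doc.pdf",
        "http://other.site/page",
    ]
    for link in candidates:
        for toDrop in drop:                  # --drop: strip matching attributes
            link = re.sub(toDrop, '', link)
        parsed_link = urlparse(link)
        if link in crawled or link in tocrawl:
            continue
        if parsed_link.netloc != target_domain:
            continue                         # foreign domains are never counted
        nbUrl += 1                           # counted even if a later check rejects it
        if os.path.splitext(parsed_link.path)[1][1:] in skipext:
            continue
        tocrawl.add(link)

    print(nbUrl, sorted(tocrawl))            # 2 ['http://blog.lesite.us/article?']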
From 26b23d7d83324fa0a3855d9c319f29ce1754e367 Mon Sep 17 00:00:00 2001
From: sebclick
Date: Sat, 11 Aug 2012 15:21:12 +0200
Subject: [PATCH 4/6] Add documentation for the drop parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an example explaining how to configure the drop parameter.
---
 .main.py.swp | Bin 0 -> 16384 bytes
 README.md    | 6 +++++-
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 .main.py.swp

diff --git a/.main.py.swp b/.main.py.swp
new file mode 100644
index 0000000000000000000000000000000000000000..a206e7af729e77923381ec52250cac06925fb945
GIT binary patch
literal 16384
[16384 bytes of base85-encoded binary data omitted: vim swap file committed by mistake, removed again in PATCH 5/6]

diff --git a/README.md b/README.md
 	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml
 
+Drop attribute from url (regexp) :
+
+	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
+
 Exclude url by filter a part of it :
 
 	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --exclude "action=edit"
 
 Read the robots.txt to ignore some url:
 
-	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --parserobots
\ No newline at end of file
+	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --parserobots
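The value passed to --drop is treated as a regular expression, not a literal string: every match is deleted from the link before it is queued. A quick illustration of the pattern documented above, applied to an invented URL:

    import re

    link = "http://blog.lesite.us/article?id=12345&page=2"   # invented example
    print(re.sub(r"id=[0-9]{5}", "", link))
    # -> http://blog.lesite.us/article?&page=2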
From 3e2f77c54708291feff4a0c2812f3abf126aa66a Mon Sep 17 00:00:00 2001
From: sebclick
Date: Sat, 11 Aug 2012 15:24:23 +0200
Subject: [PATCH 5/6] Remove swap file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Committed by mistake; removing it.
---
 .main.py.swp | Bin 16384 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .main.py.swp

diff --git a/.main.py.swp b/.main.py.swp
deleted file mode 100644
index a206e7af729e77923381ec52250cac06925fb945..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16384
[16384 bytes of base85-encoded binary data omitted]

Date: Sat, 11 Aug 2012 15:33:58 +0200
Subject: [PATCH 6/6] Add a list of excluded URLs

This new list makes it possible to "optimise" processing and to count
the URLs more accurately.
---
 main.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index c067395..f06e43e 100755
--- a/main.py
+++ b/main.py
@@ -102,6 +102,7 @@ def exclude_url(exclude, link):
 
 tocrawl = set([arg.domain])
 crawled = set([])
+excluded = set([])
 
 # TODO also search for window.location={.*?}
 linkregex = re.compile(b'')
@@ -129,7 +130,7 @@ def exclude_url(exclude, link):
 	rp.read()
 
 responseCode={}
-nbUrl=0
+nbUrl=1
 nbRp=0
 print (header, file=output_file)
 while tocrawl:
@@ -203,20 +204,28 @@ def exclude_url(exclude, link):
 			continue
 		if (link in tocrawl):
 			continue
+		if (link in excluded):
+			continue
 		if (domain_link != target_domain):
 			continue
+		if ("javascript" in link):
+			continue
 
 		# Count one more URL
 		nbUrl+=1
 
 		if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
+			if link not in excluded:
+				excluded.add(link)
 			nbRp+=1
 			continue
-		if ("javascript" in link):
-			continue
 		if (target_extension in arg.skipext):
+			if link not in excluded:
+				excluded.add(link)
 			continue
 		if (exclude_url(arg.exclude, link)==False):
+			if link not in excluded:
+				excluded.add(link)
 			continue
 
 		tocrawl.add(link)
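PATCH 6/6 remembers rejected links in a new excluded set, so a link blocked by robots.txt, its extension or an --exclude filter is evaluated and counted only once even if it appears on many pages, and nbUrl now starts at 1 to account for the start URL itself. A compact sketch of that bookkeeping, with a hypothetical consider() helper and an allowed flag standing in for the real robots.txt / extension / exclude checks:

    excluded = set()
    tocrawl = set()
    nbUrl = 1                        # the start URL itself is now counted

    def consider(link, allowed):
        """Queue a link unless it was already seen or rejected."""
        global nbUrl
        if link in tocrawl or link in excluded:
            return                   # already queued or already rejected: not recounted
        nbUrl += 1
        if not allowed:
            excluded.add(link)       # remembered, so it is never re-evaluated
            return
        tocrawl.add(link)

    consider("http://blog.lesite.us/private", allowed=False)
    consider("http://blog.lesite.us/private", allowed=False)   # filtered by the set
    consider("http://blog.lesite.us/post", allowed=True)
    print(nbUrl, tocrawl, excluded)
    # 3 {'http://blog.lesite.us/post'} {'http://blog.lesite.us/private'}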