Skip to content

Latest commit

 

History

History
54 lines (43 loc) · 1.36 KB

一个图片爬虫例子.md

File metadata and controls

54 lines (43 loc) · 1.36 KB
import requests
import re
import os
import time

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Seconds to wait on the server before giving up; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

# --- Fetch the gallery page ---
# Alternative gallery page: https://www.vmgirls.com/15198.html
response = requests.get(
    "https://www.vmgirls.com/15270.html",
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)
# Stop early on an HTTP error instead of parsing an error page.
response.raise_for_status()
html = response.text

# --- Parse the page ---
# The post heading names the output directory; the last match is used.
# (The <title> tag is an alternative source: '<title>(.*?)</title>'.)
titles = re.findall('<h1 class="post-title h1">(.*?)</h1>', html)
if not titles:
    raise IndexError("post title not found; the page layout may have changed")
dir_name = titles[-1]
# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs(dir_name, exist_ok=True)

# Image links exactly as captured from the page; the download step below
# prepends "https:" to each one.
urls = re.findall('<a href="(.*?)" alt=".*?" title=".*?">', html)
print(urls)

# --- Save the images ---
for url in urls:
    file_name = url.split('/')[-1]  # last path segment becomes the file name
    if not file_name:
        # A link ending in "/" leaves nothing to name the file with.
        continue
    time.sleep(2)  # pause between downloads
    print("https:" + url)
    response = requests.get("https:" + url, headers=headers, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # Do not store an HTTP error body under an image file name.
        print("skipped (HTTP {}): https:{}".format(response.status_code, url))
        continue
    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join(dir_name, file_name), 'wb') as f:
        f.write(response.content)
        

# Walk a directory tree and list its files as Markdown links
def walkFile(file, link_dir="计算机网络"):
    """Build one Markdown bullet link per file found under a directory tree.

    Args:
        file: Path of the directory to walk (subdirectories are included).
        link_dir: Directory prefix written into each link target. Defaults to
            "计算机网络".

    Returns:
        A list of strings shaped like ``* [stem](link_dir/filename)``, in the
        order ``os.walk`` yields the files. The link text is the file name
        without its final extension.
    """
    entries = []
    for _root, _dirs, files in os.walk(file):
        for f in files:
            # splitext drops only the last extension, so names without a dot
            # (e.g. "README") and names with several dots are both handled.
            name = os.path.splitext(f)[0]
            entries.append("* [{}]({}/{})".format(name, link_dir, f))
    return entries


if __name__ == '__main__':
    # Raw string so the Windows backslashes are taken literally; sequences such
    # as "\z" or "\D" in a normal literal are invalid escapes and trigger a
    # warning on recent Python versions.
    notes_dir = r"C:\Users\zpparts\Desktop\SoleilNotes\计算机网络"
    # Print one Markdown link line per file found under the notes directory.
    entries = walkFile(notes_dir)
    for v in entries:
        print(v)