# craw_Qingy.py
#
# 3.54 KB / 2021-07-15 18:57:29
import os
import time
from urllib.parse import unquote

from lxml import etree
from selenium import webdriver  # drives the browser
from selenium.webdriver.support.wait import WebDriverWait  # waits for page elements to load


def _strip_illegal(path, trim_spaces=False):
    """Drop the characters this script refuses to put in local paths.

    Removes every ">" and "<", optionally trims leading/trailing spaces of
    the whole string, then removes every "::" and "*".  The order is kept
    exactly as the script has always applied it, so files saved by earlier
    runs are still found by the "already downloaded" check.
    """
    path = path.replace(">", "").replace("<", "")
    if trim_spaces:
        path = path.strip(" ")
    return path.replace("::", "").replace("*", "")


def _doc_target(url, root):
    """Map a document URL to ``(directory, file path)`` below *root*.

    The first three "/"-separated pieces of *url* are ignored (for an
    absolute ``http://host/...`` link these are the scheme, an empty piece
    and the host).  The remaining pieces become nested directories; the last
    piece names both a directory (its text before ".md") and the file stored
    inside that directory.

    Returns ``None`` when there are no remaining pieces, or when one of them
    is ".." and could therefore climb out of *root*.
    """
    pieces = url.split("/")[3:]
    if not pieces or ".." in pieces:
        return None
    leaf = pieces[-1]
    doc_dir = _strip_illegal(
        os.path.join(root, *pieces[:-1], leaf.split(".md")[0]),
        trim_spaces=True,
    )
    doc_file = _strip_illegal(os.path.join(doc_dir, leaf), trim_spaces=True)
    return doc_dir, doc_file


def _image_target(src, doc_dir):
    """Return the local path for image *src* below *doc_dir*, or ``None``.

    *src* is percent-decoded and its "/" separators are converted to the
    platform separator, so the same directory layout is produced on every
    OS.  ``None`` is returned when the decoded path contains a ".." piece.
    """
    relative = unquote(src).replace("/", os.sep).lstrip(os.sep)
    if ".." in relative.split(os.sep):
        return None
    return _strip_illegal(os.path.join(doc_dir, relative))


def _discard(path):
    """Delete *path* if it is an existing file, ignoring filesystem errors."""
    try:
        if os.path.isfile(path):
            os.remove(path)
    except OSError:
        # Best-effort cleanup: a failed delete must not abort the crawl.
        pass


def _save_images(driver, url, sources, doc_dir, doc_file):
    """Store every image of one document next to its saved text.

    Each image is opened in the browser and captured with a screenshot of
    the browser window (not fetched as raw bytes).  When an image fails,
    *doc_file* is deleted so that the next run sees the document as not yet
    downloaded; the remaining images are still attempted.
    """
    for src in sources:
        if ":" in src:
            # Sources containing ":" (e.g. a URL with a scheme) are skipped.
            continue
        src = src.replace("./resource", "resource")
        target = _image_target(src, doc_dir)
        if target is None:
            print("[-]图片下载失败：{}".format(url), src)
            continue
        try:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            driver.get("http://wiki.xypbk.com/" + src)
            # save_screenshot reports a write failure by returning False
            # instead of raising, so turn that into an error here.
            if not driver.save_screenshot(target):
                raise OSError("screenshot not written: {}".format(target))
        except Exception as exc:
            print("[-]图片下载失败：{}".format(url), exc)
            # The file may already be gone after an earlier failed image.
            _discard(doc_file)


def main():
    """Download every document linked from the wiki start page.

    Opens Chrome on the wiki, waits for the user to log in manually, then
    visits each link found under elements with class "file".  For every
    link the text of the page's first <textarea> is written (UTF-8) to a
    file below the current working directory, and the images found under
    the element with id "render" are saved beside it.  Documents whose file
    already exists are skipped, so an interrupted run can be resumed.

    The browser is always shut down, even when the run is interrupted.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('http://wiki.xypbk.com/')
        # NOTE: login is manual; no cookies are injected by this script.
        input("请手动登录后按Enter键继续")
        # Reload so the page reflects the logged-in session.
        driver.refresh()
        start_page = etree.HTML(driver.page_source)
        root = os.getcwd()

        for url in start_page.xpath('//*[@class="file"]/a/@href'):
            target = _doc_target(url, root)
            if target is None:
                print("[-]保存失败：{}".format(url))
                continue
            absPath, absFileName = target
            try:
                os.makedirs(absPath, exist_ok=True)
            except (OSError, ValueError) as exc:
                print("[-]保存失败：{}".format(url), exc)
                continue

            if os.path.isfile(absFileName):
                # Already saved by an earlier run.
                continue

            print("[+]正在保存：{}".format(url))
            try:
                driver.get(url)
                page = etree.HTML(driver.page_source)
                text = page.xpath('//textarea/text()')[0].encode("utf-8")
                image_sources = page.xpath('//*[@id="render"]/p/img/@src')
                with open(absFileName, 'wb') as f:
                    f.write(text)
            except Exception as exc:
                print("[-]文件创建失败：{}".format(absFileName), exc)
                # Do not leave a partial file that would look "done" later.
                _discard(absFileName)
                continue

            _save_images(driver, url, image_sources, absPath, absFileName)

        time.sleep(3)
    finally:
        # quit() ends the whole browser session and the driver process;
        # close() would only close the current window.
        driver.quit()


# Call main() only when this file is run directly, not when it is imported.
if __name__ == '__main__':
    main()