import os
import re
import time
from urllib import error
from urllib.request import urlopen

import openpyxl
import requests
from bs4 import BeautifulSoup
path = r'C:/data/data.xlsx' root = r'C:' start = 1 end = 21914
def initDocs(): os.mkdir(root + './data') fullpath = 'C:/data/' + 'data.xlsx' file = open(fullpath, 'w') file.close()
def initDownload(): os.mkdir(root + './data/download')
def addRow(number, formula, spaceGroup, pageUrl, downloadUrl): wb = openpyxl.load_workbook(path) ws = wb.active ws.append([number, formula, spaceGroup, pageUrl, downloadUrl]) wb.save(path)
def initializationTable(): sheetStr = 'data' workbook = openpyxl.Workbook() sheet = workbook.active sheet.title = sheetStr sheet.append(['序号', 'formula', 'spaceGroup', '页面链接', '下载链接']) workbook.save(path) print("数据表初始化完成")
def downloadFile(number_of_pages, formula, downloadUrl): myfile = requests.get(downloadUrl) open('C:/data/download/' + 'POSCAR-' + number_of_pages + '-' + formula, 'wb').write(myfile.content) print(number_of_pages,'下载完成')
if __name__ == '__main__': for i in range(start, end): pageUrl = 'https://materialsweb.org/database/' + str(i) try: html = urlopen(pageUrl) except error.HTTPError as e: addRow(i, "网页异常", '网页异常', pageUrl, '网页异常') print(i, 'http请求错误:' + str(e), "错误已写入文件") continue print(i, "开始爬取") soup = BeautifulSoup(html, 'html.parser') fileName = "".join(soup.title.text.split()) resp = requests.get(pageUrl) text = resp.text try: number_of_pages = fileName.split('-')[0] formula = fileName.split('-')[1] spacegroup = re.findall(r"spacegroup = \"(.*)\"", text)[0] item = re.findall(r"/static/database/(.*)\"", text) except: formula = '网页异常' spacegroup = '网页异常' downloadUrl = '网页异常' continue downloadUrl = "https://materialsweb.org/static/database/" + item[ 0] + "/POSCAR" addRow(number_of_pages, formula, spacegroup, pageUrl, downloadUrl) print(i, "写入完成") downloadFile(number_of_pages, formula, downloadUrl) time.sleep(1) print("休眠完成继续爬取\n")