一:程序测试
def gameOver(N, z, y):
    """Return True when the game described by (N, z, y) is over, else False.

    N: game/round number (currently unused by the checks below).
    z, y: the two players' scores.

    NOTE(review): the first two conditions demand a 2-point lead while ALSO
    demanding both scores equal 20, which is impossible, so they can never be
    True; likewise z==29 and y==29 forces z-y==0, so the lead-by-1 check
    inside that branch can never succeed. The intended rule (probably
    "lead by 2 after deuce") should be confirmed against the assignment and
    the expected-result table in Test().
    """
    if z - y == 2 and z == 20 and y == 20:
        return True
    elif y - z == 2 and z == 20 and y == 20:
        return True
    elif z == 29 and y == 29:
        # BUG FIX: the original had a bare inner `if` with no `else`, so this
        # branch fell through and implicitly returned None; always return an
        # explicit bool instead.
        return z - y == 1 or y - z == 1
    else:
        return False


def Test():
    """Run gameOver against a table of inputs and expected string results."""
    try:
        N = [1, 2, 3, 4, 5, 5, 5]
        z = [13, 19, 20, 21, 14, 17, 15]
        y = [15, 18, 20, 23, 16, 16, 0]
        result = ["True", "False", "False", "True", "True", "True", "True"]
        for i in range(7):
            if str(gameOver(N[i], z[i], y[i])) == result[i]:
                print("Test {}: Right Result!".format(i + 1))
            else:
                print("Test {}: Error Result!".format(i + 1))
    except Exception as e:
        print("Error:", e)


Test()
二:打开360网页
import requests


def getHTMLText(url):
    """Fetch *url* and return summary information about the response.

    Returns a 6-tuple: (text, status_code, encoding, len(text),
    number of children of the first <p> tag, type of the soup object),
    or the empty string "" on any request/parse failure (the original
    error contract, kept for backward compatibility).
    """
    try:
        r = requests.get(url, timeout=30)
        # BUG FIX: validate the status and set the encoding BEFORE parsing,
        # so error pages are rejected early and the text is decoded
        # consistently; the original built the soup first.
        r.raise_for_status()
        r.encoding = 'utf_8'
        from bs4 import BeautifulSoup
        # Name the parser explicitly to avoid bs4's "no parser specified"
        # warning and parser-dependent behavior.
        soup = BeautifulSoup(r.text, 'html.parser')
        return (r.text, r.status_code, r.encoding, len(r.text),
                len(soup.p.contents), type(soup))
    except Exception:
        # Best-effort fetch: callers treat "" as failure. (Narrowed from the
        # original bare `except:`, which also swallowed KeyboardInterrupt.)
        return ""


url = 'https://hao.360.cn'
for i in range(20):
    print(i)
    print(getHTMLText(url))
三:html计算
# -*- coding: utf-8 -*-
"""Created on Mon May 20 10:03:00 2019

@author: 27
"""
from requests import get


def getText(url):
    """Download *url* and return its decoded text, or '' on failure."""
    try:
        r = get(url, timeout=5)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print("Error:", e)
        return ''


from bs4 import BeautifulSoup

url = "https://www.runoob.com/"
html = getText(url)
# Name the parser explicitly to avoid bs4's "no parser specified" warning.
soup = BeautifulSoup(html, "html.parser")
print("head:", soup.head)
print("head:", len(soup.head))
print("body:", soup.body)
print("body:", len(soup.body))
print("title:", soup.title)
# BUG FIX: the original line was missing its closing parenthesis
# (SyntaxError).
print("special_id:", soup.find(id='cd-login'))

import re


def getACH(text):
    """Return only the Chinese (CJK Unified Ideograph) characters of *text*."""
    text_unicode = text.strip()
    # Split on every non-CJK character, then re-join the surviving pieces.
    string = re.compile('[^\u4e00-\u9fff]')
    ACH = "".join(string.split(text_unicode))
    return ACH


# BUG FIX: the original called ACH(html), but ACH is only a local inside
# getACH — the function itself is named getACH (NameError at runtime).
print("ACH:", getACH(html))
四:中国大学排名(爬取2018年排名数据)
# -*- coding: utf-8 -*-
"""Created on Thu May 23 08:39:07 2019

@author: 27

Scrape the 2018 China best-universities ranking table, print the top
rows, and export the full data set to university.csv.
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd

allUniv = []  # module-level accumulator filled by fillUnivList()


def getHTMLText(url):
    """Fetch *url* and return its UTF-8 text, or '' on any failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = "utf-8"
        return r.text
    except Exception:
        # Best-effort fetch: callers treat "" as failure (original contract;
        # narrowed from the original bare `except:`).
        return ""


# Each <td> inside a <tr> holds one column of the ranking table, matching the
# header one-to-one. To extract the data, find every <tr>, walk its <td>
# children, and store their values in the program's data structure.
def fillUnivList(soup):
    """Parse every table row in *soup* into allUniv (a list of str lists)."""
    for tr in soup.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Header rows contain <th> only — skip them.
            continue
        allUniv.append([td.string for td in tds])


def printUnivList(num):
    """Pretty-print the first *num* universities (rank, name, province, score, col 6)."""
    # chr(12288) is the full-width (ideographic) space, used as the fill
    # character so CJK columns stay aligned.
    print("{1:^2}{2:{0}^10}{3:{0}^6}{4:{0}^4}{5:{0}^10}".format(
        chr(12288), "排名", "学校名称", "省市", "总分", "年费"))
    for i in range(num):
        u = allUniv[i]
        print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^10}".format(
            chr(12288), u[0], u[1], u[2], u[3], u[6]))


def main(num):
    """Download the 2018 ranking page, parse it, and print the top *num* rows."""
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(num)


main(30)

# BUG FIX: the original bound the scraped rows to the name `list`, shadowing
# the builtin; use descriptive names instead.
columns = ["paiming", "学校名称", "省市", "总分", "生涯质量", "培养结果", "社会声誉",
           "科研规模", "科研质量", "顶尖成果", "顶尖人才", "科技服务", "成果转化",
           "学生国际化"]
frame = pd.DataFrame(columns=columns, data=allUniv)
frame.to_csv('university.csv', encoding='gbk')
五:函数介绍总结
1.
| 函数 | 说明 |
| --- | --- |
| get(url [, timeout=n]) | 对应HTTP的GET方式,设定请求超时时间为n秒 |
| post(url, data={'key':'value'}) | 对应HTTP的POST方式,字典用于传输客户数据 |
| delete(url) | 对应HTTP的DELETE方式 |
| head(url) | 对应HTTP的HEAD方式 |
| options(url) | 对应HTTP的OPTIONS方式 |
| put(url, data={'key':'value'}) | 对应HTTP的PUT方式,字典用于传输客户数据 |
2.
| 属性 | 说明 |
| --- | --- |
| status_code | HTTP请求的返回状态 |
| encoding | HTTP响应内容的编码方式 |
| text | HTTP响应内容的字符串形式 |
| content | HTTP响应内容的二进制形式 |

| 方法 | 说明 |
| --- | --- |
| json() | 若HTTP响应内容中包含JSON格式数据,则解析JSON数据 |
| raise_for_status() | 若HTTP返回的状态码不是200,则产生异常 |