资源简介
NBA数据爬虫
代码片段和文件信息
# Standard library imports (the scrape mangled `import xdrlib, sys` into one name).
import re
import sys
import xdrlib

# Third-party imports.
import urllib2
import xlrd
import xlwt
from bs4 import BeautifulSoup
def transformCodec(re_data):
    """Decode a GBK byte string to unicode, dropping illegal byte runs.

    If decoding fails, the UnicodeDecodeError message is parsed for the
    offending byte range ("can't decode bytes in position A-B: illegal ...");
    that slice is cut out of the input and decoding is retried recursively.

    Parameters:
        re_data: GBK-encoded byte string.

    Returns:
        The decoded unicode string; if the error position cannot be
        recovered from the message, the (possibly already truncated)
        input is returned unchanged.
    """
    try:
        re_data = re_data.decode('gbk')
    except Exception as error:
        print(error)
        print('delete illegal string,try again...')
        # Strip all spaces from the error message, then pull out the
        # start/end offsets of the undecodable byte range.
        pos = re.findall(r'decodebytesinposition([\d]+)-([\d]+):illegal',
                         str(error).replace(' ', ''))
        if len(pos) == 1:
            # Cut out the illegal slice and retry on the remainder.
            re_data = re_data[0:int(pos[0][0])] + re_data[int(pos[0][1]):]
            re_data = transformCodec(re_data)
            return re_data
    return re_data
# Build the workbook that will hold the scraped NBA matchup statistics.
# (renamed from `file`, which shadows the Python builtin)
workbook = xlwt.Workbook()
table = workbook.add_sheet('shuju', cell_overwrite_ok=True)

# Header row: one column per statistic, written left to right.
headers = ['team', 'W/L', 'Strk', 'Home', 'Away', 'Day', 'Night', 'Div', 'Conf']
for idx, title in enumerate(headers):
    table.write(0, idx, title)

row = 1
col = 0
# NOTE(review): the scraped source lost its commas; `range(128)` was most
# likely `range(1, 28)` (pages g5_preview_1.html .. g5_preview_27.html) —
# confirm the real page count against covers.com.
for page in range(1, 28):
    print(page)
    url = ("http://www.covers.com/pageLoader/pageLoader.aspx"
           "?page=/data/nba/matchups/g5_preview_" + str(page) + ".html")
    response = urllib2.urlopen(url)
    print(response.getcode())
    soup = BeautifulSoup(
        response,
        'html.parser',
        from_encoding='utf-8'
    )
    # Only the first two "sdi-so" divs are fetched; the second one
    # (cishu == 1) contains the stats table we want.
    links2 = soup.find_all('div', class_="sdi-so", limit=2)
    cishu = 0
    for section in links2:
        if cishu == 1:
            cells = section.find_all('td', class_="sdi-datacell")
            for cell in cells:
                print(cell.text)
                table.write(row, col, cell.text)
                # Advance across the 9 columns, wrapping to the next row.
                col = (col + 1) % 9
                if col == 0:
                    row = row + 1
            # Blank row between pages.
            row = row + 1
            # Checkpoint the workbook after every page so a crash
            # mid-scrape does not lose everything.
            workbook.save('NBA.xls')
        cishu = cishu + 1
workbook.save('NBA.xls')
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
目录 0 2017-04-10 06:23 PythonApplication1\
目录 0 2017-04-15 16:38 PythonApplication1\PythonApplication1\
文件 859 2017-04-10 06:23 PythonApplication1\PythonApplication1.sln
文件 18944 2017-05-03 03:01 PythonApplication1\PythonApplication1.v12.suo
文件 13824 2017-04-24 00:04 PythonApplication1\PythonApplication1\NBA.xls
文件 1859 2017-04-15 16:38 PythonApplication1\PythonApplication1\PythonApplication1.py
文件 1953 2017-04-10 06:23 PythonApplication1\PythonApplication1\PythonApplication1.pyproj
- 上一篇:人民日报语料1.rar
- 下一篇:SSR_for_win.zip
评论
共有 条评论