Python crawler example: downloading the images from every floor of a Baidu Tieba post


This post presents two Python crawler examples, shared for your reference; the details are as follows. The first downloads the images from every floor of a Baidu Tieba post; the second scrapes NBA game data from stat-nba.com.

1. Baidu Tieba image downloader

Requirement: download every image posted in a given Baidu Tieba thread, across all pages of the post.

Python 2.7 version; the source code is as follows:

#coding=utf-8
import re
import requests
import urllib
from bs4 import BeautifulSoup
import time
time1 = time.time()
def getHtml(url):
  page = requests.get(url)
  html = page.text
  return html
def getImg(html):
  # Images uploaded in the post body carry the class "BDE_Image"
  soup = BeautifulSoup(html, 'html.parser')
  img_info = soup.find_all('img', class_='BDE_Image')
  global index
  for index, img in enumerate(img_info, index + 1):
    print("Downloading image {}".format(index))
    # The target directory C:/pic4/ must exist beforehand
    urllib.urlretrieve(img.get("src"), 'C:/pic4/%s.jpg' % index)
def getMaxPage(url):
  # The total page count is stored in the pager's max-page attribute
  html = getHtml(url)
  reg = re.compile(r'max-page="(\d+)"')
  page = re.findall(reg, html)
  page = int(page[0])
  return page
if __name__ == '__main__':
  url = "https://tieba.baidu.com/p/5113603072"
  page = getMaxPage(url)
  index = 0
  for i in range(1, page + 1):  # pn is 1-based, so include the last page
    url = "%s%s" % ("https://tieba.baidu.com/p/5113603072?pn=", str(i))
    html = getHtml(url)
    getImg(html)
  print("OK! All downloaded!")
  time2 = time.time()
  print(u'Total time: ' + str(time2 - time1) + 's')
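
The script above is Python 2 only (urllib.urlretrieve and the default-encoding tricks). For readers on Python 3, here is a minimal sketch of what a port might look like; the function name download_post_images and the output directory are my own choices, and the snippet has not been verified against the live page:

#coding=utf-8
# Hypothetical Python 3 port of the downloader above (a sketch, not from the
# original article); urllib.urlretrieve became urllib.request.urlretrieve.
import os
import re
import requests
from urllib.request import urlretrieve
from bs4 import BeautifulSoup

def download_post_images(post_url, out_dir='pics'):
  os.makedirs(out_dir, exist_ok=True)       # create the target directory if missing
  html = requests.get(post_url).text
  match = re.search(r'max-page="(\d+)"', html)
  max_page = int(match.group(1)) if match else 1
  index = 0
  for pn in range(1, max_page + 1):         # Tieba paginates with ?pn=1, ?pn=2, ...
    page_html = requests.get('%s?pn=%d' % (post_url, pn)).text
    soup = BeautifulSoup(page_html, 'html.parser')
    for img in soup.find_all('img', class_='BDE_Image'):
      index += 1
      urlretrieve(img.get('src'), os.path.join(out_dir, '%d.jpg' % index))

download_post_images("https://tieba.baidu.com/p/5113603072")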

2. NBA data crawler

The site crawled is stat-nba.com; this example fetches the NBA 2016-17 regular-season data up to January 7, 2017. Changing url_header and url_tail lets you crawl other data from the same site.

Python 2.7 version; the source code is as follows:

#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 workaround for writing UTF-8 text
import requests
import time
import urllib
from bs4 import BeautifulSoup
import re
from pyExcelerator import *
def getURLLists(url_header,url_tail,pages):
  """
  Build the URL list for every result page (page 0 through `pages`)
  """
  url_lists = []
  url_0 = url_header+'0'+url_tail
  print url_0
  url_lists.append(url_0)
  for i in range(1,pages+1):
    url_temp = url_header+str(i)+url_tail
    url_lists.append(url_temp)
  return url_lists
def getNBAAllData(url_lists):
  """
  Fetch all 2016-17 NBA regular-season data
  """
  datasets = ['']
  for item in url_lists:
    data1 = getNBASingleData(item)
    datasets.extend(data1)
  # Drop the empty strings left over from splitting the table text
  for item in datasets[:]:
    if len(item) == 0:
      datasets.remove(item)
  return datasets
def getNBASingleData(url):
  """
  Fetch the NBA regular-season data from a single result page
  """
  html = urllib.urlopen(url).read()
  # html = requests.get(url).text  # equivalent alternative using requests
  soup = BeautifulSoup(html, 'html.parser')
  # The stats live in the page's <tbody>; one cell per line once split
  data = soup.html.body.find('tbody').text
  list_data = data.split('\n')
  return list_data
def saveDataToExcel(datasets,sheetname,filename):
  book = Workbook()
  sheet = book.add_sheet(sheetname)
  # Column headers: each game is one row of 24 stats
  sheet.write(0,0,u'No.')
  sheet.write(0,1,u'Team')
  sheet.write(0,2,u'Date')
  sheet.write(0,3,u'Result')
  sheet.write(0,4,u'Home/Away')
  sheet.write(0,5,u'Game')
  sheet.write(0,6,u'FG%')
  sheet.write(0,7,u'FGM')
  sheet.write(0,8,u'FGA')
  sheet.write(0,9,u'3P%')
  sheet.write(0,10,u'3PM')
  sheet.write(0,11,u'3PA')
  sheet.write(0,12,u'FT%')
  sheet.write(0,13,u'FTM')
  sheet.write(0,14,u'FTA')
  sheet.write(0,15,u'REB')
  sheet.write(0,16,u'ORB')
  sheet.write(0,17,u'DRB')
  sheet.write(0,18,u'AST')
  sheet.write(0,19,u'STL')
  sheet.write(0,20,u'BLK')
  sheet.write(0,21,u'TOV')
  sheet.write(0,22,u'PF')
  sheet.write(0,23,u'PTS')
  num = 24  # 24 columns per game row
  row_cnt = 0
  data_cnt = 0
  data_len = len(datasets)
  print 'data_len:',data_len
  while data_cnt < data_len:
    row_cnt += 1
    print 'Row:',row_cnt
    for col in range(num):
      sheet.write(row_cnt,col,datasets[data_cnt])
      data_cnt += 1
  book.save(filename)
def writeDataToTxt(datasets):
  fp = open('nba_data.txt','w')
  line_cnt = 1
  for i in range(len(datasets)-1):
    # Align the team-name column: short names (and u'费城76人', the
    # Philadelphia 76ers) get two tabs after them, everything else one tab
    if (line_cnt % 24 == 2 and len(datasets[i]) < 5) or datasets[i] == u'费城76人':
      fp.write(datasets[i]+'\t\t')
    else:
      fp.write(datasets[i]+'\t')
    line_cnt += 1
    if line_cnt % 24 == 1:
      fp.write('\n')
  fp.close()
if __name__ == "__main__":
  pages = int(1132/150)  # presumably total result rows (1132) at 150 rows per page
  url_header = 'http://stat-nba.com/query_team.php?page='
  url_tail = '&QueryType=game&order=1&crtcol=date_out&GameType=season&PageNum=3000&Season0=2016&Season1=2017#label_show_result'
  url_lists = getURLLists(url_header,url_tail,pages)
  datasets = getNBAAllData(url_lists)
  writeDataToTxt(datasets)
  sheetname = 'nba normal data 2016-2017'
  str_time = time.strftime('%Y-%m-%d',time.localtime(time.time()))
  filename = 'nba_normal_data'+str_time+'.xls'
  saveDataToExcel(datasets,sheetname,filename)
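
As noted above, only url_header and url_tail need to change to pull other data. For example, a sketch of what the 2015-16 regular-season query might look like, assuming stat-nba.com accepts the same Season0/Season1 parameters for other seasons (unverified):

# Hypothetical: 2015-16 regular season, assuming the site accepts the same
# query parameters for other seasons (unverified)
url_header = 'http://stat-nba.com/query_team.php?page='
url_tail = ('&QueryType=game&order=1&crtcol=date_out&GameType=season'
            '&PageNum=3000&Season0=2015&Season1=2016#label_show_result')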
