Python crawler: implementing a campus network auto-reconnect script

Date: 2019-12-06 15:23

This article presents three small Python examples, shared for your reference: a campus network auto-reconnect script, a downloader for all the images in a Baidu Tieba post, and a query against the national blacklist of dishonest persons subject to enforcement (失信被执行人).

1. Background and requirements

Campus network auto-reconnect: for a while our campus network kept dropping the connection for reasons I never figured out. Every time it dropped I had to open a web browser, re-enter my account and password, and log in again by hand. I cannot fix the network itself, but I can write a simple Python script that logs in to the campus portal automatically. After a disconnect, opening any web page lands on the portal login page.

Tieba image downloader: grab the images from every floor (reply) of a Baidu Tieba post; they are nice to look at.

Blacklist query: using a Baidu API, implement a nationwide blacklist lookup. Enter a person's name and check whether it appears on the list of dishonest persons subject to enforcement.


2. Implementation

All three scripts below are written for Python 2.7.

First, the campus network auto-login script. Writing it requires a packet-capture tool, because we need the form data the portal submits and the URL it posts to; I used the HttpFox plugin for Firefox. The captured URL, headers, and payload appear verbatim in the code below.
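Before wiring the captured values into the script, it can help to check exactly what requests will put on the wire. Below is a quick, hypothetical sanity-check sketch: the url, headers, and payload here are placeholders standing in for your own capture, not values from this article.

import requests

url = "http://portal.example/login"  # hypothetical placeholder
headers = {'Content-Type': "application/x-www-form-urlencoded"}
payload = {'userName': "...", 'userPwd': "..."}

# prepare() assembles the request without sending it, so each field
# can be compared against the HttpFox capture
prepared = requests.Request("POST", url, headers=headers, data=payload).prepare()
print(prepared.url)
print(prepared.headers)
print(prepared.body)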

#-*- coding:utf-8 -*-
__author__ = 'pf'
import time
import requests

class Login:
    # initialization
    def __init__(self):
        # interval between connectivity checks, in seconds
        self.every = 10

    # simulate the portal login
    def login(self):
        print self.getCurrentTime(), u"Trying hard to get online..."
        url = "http://222.24.19.190:8080/portal/pws?t=li"
        # request headers, copied verbatim from the HttpFox capture
        headers = {
            'Host': "222.24.19.190:8080",
            'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0",
            'Accept': "application/json, text/javascript, */*; q=0.01",
            'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            'Accept-Encoding': "gzip, deflate",
            'Referer': "http://222.24.19.190:8080/portal/index_default.jsp",
            'Content-Type': "application/x-www-form-urlencoded",
            'X-Requested-With': "XMLHttpRequest",
            'Content-Length': "291",
            'Connection': "close"
        }
        # form data to submit, also from the capture
        payload = {
            'userName': '1403810041',
            'userPwd': 'MTk4NDEy',
            'userurl': 'http%3A%2F%2Fwww.msn.com%3Focid%3Dwispr&userip=222.24.52.200',
            'portalProxyIP': '222.24.19.190',
            'portalProxyPort': '50200',
            'dcPwdNeedEncrypt': '1',
            'assignIpType': '0',
            'appRootUrl': '=http%3A%2F%2F222.24.19.190%3A8080%2Fportal%2F',
            'manualUrlEncryptKey': 'rTCZGLy2wJkfobFEj0JF8A%3D%3D'
        }
        try:
            r = requests.post(url, headers=headers, data=payload)
            print self.getCurrentTime(), u"Connected... now watching whether the link stays up"
        except:
            print self.getCurrentTime(), u"login request failed"

    # can we reach the internet right now?
    def canConnect(self):
        try:
            q = requests.get("http://www.baidu.com")
            return q.status_code == 200
        except:
            # treat any network error as "offline"
            return False

    # timestamp for log messages
    def getCurrentTime(self):
        return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

    # main loop: log in once, then re-login whenever the connection drops
    def main(self):
        print self.getCurrentTime(), u"Hi, welcome to the auto-login script"
        self.login()
        while True:
            if not self.canConnect():
                print self.getCurrentTime(), u"Connection lost..."
                self.login()
            else:
                print self.getCurrentTime(), u"All good..."
            time.sleep(self.every)

login = Login()
login.main()
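The script above is Python 2 only (note the print statements). For anyone on Python 3, a minimal sketch of the same check-and-relogin loop might look like the following; the portal URL is the one captured above, while HEADERS and PAYLOAD are placeholders to fill in from your own capture.

# Minimal Python 3 sketch of the same idea. HEADERS and PAYLOAD
# are placeholders for values from your own packet capture.
import time
import requests

PORTAL_URL = "http://222.24.19.190:8080/portal/pws?t=li"
HEADERS = {}       # copy from your capture
PAYLOAD = {}       # copy from your capture
CHECK_EVERY = 10   # seconds between connectivity checks

def can_connect():
    # "online" means an ordinary site answers with HTTP 200
    try:
        return requests.get("http://www.baidu.com", timeout=5).status_code == 200
    except requests.RequestException:
        return False

def login():
    try:
        requests.post(PORTAL_URL, headers=HEADERS, data=PAYLOAD, timeout=5)
    except requests.RequestException as exc:
        print("login request failed:", exc)

if __name__ == "__main__":
    while True:
        if not can_connect():
            print("connection lost, logging in again...")
            login()
        time.sleep(CHECK_EVERY)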
Next, the Tieba image downloader (also Python 2.7):

#coding=utf-8
import re
import requests
import urllib
from bs4 import BeautifulSoup
import time

time1 = time.time()

# fetch the HTML of one page of the thread
def getHtml(url):
    page = requests.get(url)
    html = page.text
    return html

# find every post image on the page and save it to disk
def getImg(html):
    soup = BeautifulSoup(html, 'html.parser')
    # images posted in Tieba threads carry the BDE_Image class
    img_info = soup.find_all('img', class_='BDE_Image')
    global index
    for index, img in enumerate(img_info, index + 1):
        print("Downloading image {}".format(index))
        # the target directory C:/pic4/ must already exist
        urllib.urlretrieve(img.get("src"), 'C:/pic4/%s.jpg' % index)

# read the thread's page count from its max-page attribute
def getMaxPage(url):
    html = getHtml(url)
    reg = re.compile(r'max-page="(\d+)"')
    page = re.findall(reg, html)
    page = int(page[0])
    return page

if __name__ == '__main__':
    url = "https://tieba.baidu.com/p/5113603072"
    page = getMaxPage(url)
    index = 0
    for i in range(1, page + 1):  # pn=1..page covers every page of the thread
        url = "%s%s" % ("https://tieba.baidu.com/p/5113603072?pn=", str(i))
        html = getHtml(url)
        getImg(html)
    print("OK! All downloaded!")
    time2 = time.time()
    print u'Total time: ' + str(time2 - time1) + 's'
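A portability note: urllib.urlretrieve exists only in Python 2 (Python 3 moved it to urllib.request.urlretrieve). A minimal Python 3 sketch of the per-page download step, assuming a hypothetical pics/ output directory, could look like this:

# Minimal Python 3 sketch of the download step ("pics" is a
# hypothetical output directory, created if missing).
import os
import requests
from bs4 import BeautifulSoup

def download_images(html, out_dir="pics", start=0):
    os.makedirs(out_dir, exist_ok=True)
    soup = BeautifulSoup(html, "html.parser")
    index = start
    for index, img in enumerate(soup.find_all("img", class_="BDE_Image"), start + 1):
        print("Downloading image %d" % index)
        data = requests.get(img.get("src"), timeout=10).content
        with open(os.path.join(out_dir, "%d.jpg" % index), "wb") as f:
            f.write(data)
    return index

Returning the last index used lets the caller feed it back in as start for the next page, mirroring the global index bookkeeping in the Python 2 version.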

Finally, the dishonest-debtor blacklist query. Version 1 of the script:

# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: avoid unicode/str encoding errors
import time
import json
import requests
import pandas as pd

time1 = time.time()
iname = []
icard = []

# query up to 30 pages of results (10 records per page) for one name
def person_executed(name):
    for i in range(0, 30):
        try:
            url = ("https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?resource_id=6899"
                   "&query=%E5%A4%B1%E4%BF%A1%E8%A2%AB%E6%89%A7%E8%A1%8C%E4%BA%BA%E5%90%8D%E5%8D%95"
                   "&cardNum=&"
                   "iname=" + str(name) +
                   "&areaName="
                   "&pn=" + str(i * 10) +
                   "&rn=10"
                   "&ie=utf-8&oe=utf-8&format=json")
            html = requests.get(url).content
            html_json = json.loads(html)
            html_data = html_json['data']
            for block in html_data:
                for record in block['result']:
                    print record['iname'], record['cardNum']
                    iname.append(record['iname'])
                    icard.append(record['cardNum'])
        except:
            # pages past the last result carry no 'data'/'result'; skip them
            pass

if __name__ == '__main__':
    name = "郭**"
    person_executed(name)
    print len(iname)
    # organize the results into a data frame
    data = pd.DataFrame({"name": iname, "IDCard": icard})
    # drop duplicate rows
    data1 = data.drop_duplicates()
    print data1
    print len(data1)
    # write the de-duplicated records out to Excel
    data1.to_excel("F:/iname_icard_query.xlsx", header=True, encoding='gbk', index=False)
    time2 = time.time()
    print u'OK, crawl finished!'
    print u'Total time: ' + str(time2 - time1) + 's'
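The hand-concatenated query string above is fragile (in the original it depended on line-continuation backslashes, which are easy to lose). The same request can instead be assembled from a parameter dict, letting requests handle the percent-encoding. A Python 3 sketch follows; the endpoint, resource_id, and parameter names are taken from the URL used above, and everything else about the API's behaviour is an assumption based on that capture.

# Python 3 sketch: build the same Baidu API query from a params
# dict instead of manual string concatenation.
import requests

API = "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php"

def query_page(name, page=0):
    # parameter names as seen in the URL used in the script above
    params = {
        "resource_id": "6899",
        "query": "失信被执行人名单",
        "cardNum": "",
        "iname": name,
        "areaName": "",
        "pn": page * 10,   # offset: 10 records per page
        "rn": 10,
        "ie": "utf-8",
        "oe": "utf-8",
        "format": "json",
    }
    # requests percent-encodes the Chinese query value automatically
    resp = requests.get(API, params=params, timeout=10)
    return resp.json().get("data", [])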


3. Results

Once started, the login script checks connectivity every 10 seconds and re-logs-in as soon as the network drops; the Tieba script saves each image it finds under a running index; the blacklist query prints every matching name and ID number, then writes the de-duplicated records to an Excel file.

PS: two handy regular-expression tools for reference: an online JavaScript regex tester and an online regex generator.