Python模拟登陆淘宝并统计淘宝消费情况的代码实例分享

支付宝十年账单上的数字有点吓人,但它统计的项目太多,而我只想看看单纯在淘宝上到底支出了多少,于是写了段脚本,用来统计任意时间段内淘宝订单的消费情况。从结果来看,我在淘宝上其实还是相当节约的。
脚本的主要工作是模拟了浏览器登录,解析“已买到的宝贝”页面以获得指定的订单及宝贝信息。

201674184737787.gif (410×235)

使用方法见代码或执行命令加参数-h,另外需要BeautifulSoup4支持,BeautifulSoup的官方项目列表页:https://www.crummy.com/software/BeautifulSoup/bs4/download/

首先来说一下代码使用方法:

python taobao.py -u USERNAME -p PASSWORD -s START-DATE -e END-DATE --verbose

登录后复制

所有参数均可选,如:

python taobao.py -u jinnlynn 

登录后复制

统计用户jinnlynn所有订单的情况

python taobao.py -s 2014-12-12 -e 2014-12-12

登录后复制

统计用户(用户名在命令执行时会要求输入)在2014-12-12当天的订单情况

python taobao.py --verbose

登录后复制

这样就可以统计并输出订单明细。

好了,说了这么多我们就来看代码吧:

from __future__ import unicode_literals, print_function, absolute_import, divisionimport urllibimport urllib2import urlparseimport cookielibimport reimport sysimport osimport jsonimport subprocessimport argparseimport platformfrom getpass import getpassfrom datetime import datetimefrom pprint import pprinttry:  from bs4 import BeautifulSoupexcept ImportError:  sys.exit('BeautifulSoup4 missing.')__version__ = '1.0.0'__author__ = 'JinnLynn'__copyright__ = 'Copyright (c) 2014 JinnLynn'__license__ = 'The MIT License'HEADERS = {  'x-requestted-with' : 'XMLHttpRequest',  'Accept-Language' : 'zh-cn',  'Accept-Encoding' : 'gzip, deflate',  'ContentType' : 'application/x-www-form-urlencoded; chartset=UTF-8',  'Cache-Control' : 'no-cache',  'User-Agent' :'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.38 Safari/537.36',  'Connection' : 'Keep-Alive'}DEFAULT_POST_DATA = {  'TPL_username' : '', #用户名  'TPL_password' : '', #密码  'TPL_checkcode' : '',  'need_check_code' : 'false',  'callback' : '0', # 有值返回JSON}# 无效订单状态INVALID_ORDER_STATES = [  'CREATE_CLOSED_OF_TAOBAO', # 取消  'TRADE_CLOSED', # 订单关闭]LOGIN_URL = 'https://login.taobao.com/member/login.jhtml'RAW_IMPUT_ENCODING = 'gbk' if platform.system() == 'Windows' else 'utf-8'def _request(url, data, method='POST'):  if data:    data = urllib.urlencode(data)  if method == 'GET':    if data:      url = '{}?{}'.format(url, data)    data = None  # print(url)  # print(data)  req = urllib2.Request(url, data, HEADERS)  return urllib2.urlopen(req)def stdout_cr(msg=''):  sys.stdout.write('{:10}'.format(' '))  sys.stdout.write('{}'.format(msg))  sys.stdout.flush()def get(url, data=None):  return _request(url, data, method='GET')def post(url, data=None):  return _request(url, data, method='POST')def login_post(data):  login_data = DEFAULT_POST_DATA  login_data.update(data)  res = post(LOGIN_URL, login_data)  return json.load(res, encoding='gbk')def login(usr, pwd):  data = {    
'TPL_username' : usr.encode('utf-8' if platform.system() == 'Windows' else 'GB18030'),    'TPL_password' : pwd  }  # 1. 尝试登录  ret = login_post(data)  while not ret.get('state', False):    code = ret.get('data', {}).get('code', 0)    if code == 3425 or code == 1000:      print('INFO: {}'.format(ret.get('message')))      check_code = checkcode(ret.get('data', {}).get('ccurl'))      data.update({'TPL_checkcode' : check_code, 'need_check_code' : 'true'})      ret = login_post(data)    else:      sys.exit('ERROR. code: {}, message:{}'.format(code, ret.get('message', '')))  token = ret.get('data', {}).get('token')  print('LOGIN SUCCESS. token: {}'.format(token))  # 2. 重定向  # 2.1 st值  res = get('https://passport.alipay.com/mini_apply_st.js', {    'site' : '0',    'token' : token,    'callback' : 'stCallback4'})  content = res.read()  st = re.search(r'"st":"(S*)"( |})', content).group(1)  # 2.1 重定向  get('http://login.taobao.com/member/vst.htm',    {'st' : st, 'TPL_uesrname' : usr.encode('GB18030')})def checkcode(url):  filename, _ = urllib.urlretrieve(url)  if not filename.endswith('.jpg'):    old_fn = filename    filename = '{}.jpg'.format(filename)    os.rename(old_fn, filename)  if platform.system() == 'Darwin':    # mac 下直接preview打开    subprocess.call(['open', filename])  elif platform.system() == 'Windows':    # windows 执行文件用默认程序打开    subprocess.call(filename, shell=True)  else:    # 其它系统 输出文件名    print('打开该文件获取验证码: {}'.format(filename))  return raw_input('输入验证码: '.encode(RAW_IMPUT_ENCODING))def parse_bought_list(start_date=None, end_date=None):  url = 'http://buyer.trade.taobao.com/trade/itemlist/list_bought_items.htm'  #         运费险      增值服务     分段支付(定金,尾款)  extra_service = ['freight-info', 'service-info', 'stage-item']  stdout_cr('working... {:.0%}'.format(0))  # 1. 解析第一页  res = urllib2.urlopen(url)  soup = BeautifulSoup(res.read().decode('gbk'))  # 2. 
获取页数相关  page_jump = soup.find('span', id='J_JumpTo')  jump_url = page_jump.attrs['data-url']  url_parts = urlparse.urlparse(jump_url)  query_data = dict(urlparse.parse_qsl(url_parts.query))  total_pages = int(query_data['tPage'])  # 解析  orders = []  cur_page = 1  out_date = False  errors = []  while True:    bought_items = soup.find_all('tbody', attrs={'data-orderid' : True})    # pprint(len(bought_items))    count = 0    for item in bought_items:      count += 1      # pprint('{}.{}'.format(cur_page, count))      try:        info = {}        # 订单在页面上的位置 页数.排序号        info['pos'] = '{}.{}'.format(cur_page, count)        info['orderid'] = item.attrs['data-orderid']        info['status'] = item.attrs['data-status']        # 店铺        node = item.select('tr.order-hd a.shopname')        if not node:          # 店铺不存在,可能是赠送彩票订单,忽略          # print('ignore')          continue        info['shop_name'] = node[0].attrs['title'].strip()        info['shop_url'] = node[0].attrs['href']        # 日期        node = item.select('tr.order-hd span.dealtime')[0]        info['date'] = datetime.strptime(node.attrs['title'], '%Y-%m-%d %H:%M')        if end_date and info['date'].toordinal() > end_date.toordinal():          continue        if start_date and info['date'].toordinal()  1:                bb['snapshot'] = name_node[1].attrs['href']              # 宝贝规格              bb['spec'] = n.select('.spec')[0].text.strip()              # 宝贝价格              bb['price'] = float(n.find('td', class_='price').attrs['title'])              # 宝贝数量              bb['quantity'] = int(n.find('td', class_='quantity').attrs['title'])              bb['is_goods'] = True            baobei.append(bb)            # 尝试获取实付款            # 实付款所在的节点可能跨越多个tr的td            amount_node = n.select('td.amount em.real-price')            if amount_node:              info['amount'] = float(amount_node[0].text)          except Exception as e:            errors.append({              'type' : 'baobei',              'id' : 
'{}.{}'.format(cur_page, count),              'node' : '{}'.format(n),              'error' : '{}'.format(e)            })      except Exception as e:        errors.append({          'type' : 'order',          'id' : '{}.{}'.format(cur_page, count),          'node' : '{}'.format(item),          'error' : '{}'.format(e)        })      info['baobei'] = baobei      orders.append(info)    stdout_cr('working... {:.0%}'.format(cur_page / total_pages))    # 下一页    cur_page += 1    if cur_page > total_pages or out_date:      break    query_data.update({'pageNum' : cur_page})    page_url = '{}?{}'.format(url, urllib.urlencode(query_data))    res = urllib2.urlopen(page_url)    soup = BeautifulSoup(res.read().decode('gbk'))  stdout_cr()  if errors:    print('INFO. 有错误发生,统计结果可能不准确。')    # pprint(errors)  return ordersdef output(orders, start_date, end_date):  amount = 0.0  org_amount = 0  baobei_count = 0  order_count = 0  invaild_order_count = 0  for order in orders:    if order['status'] in INVALID_ORDER_STATES:      invaild_order_count += 1      continue    amount += order['amount']    order_count += 1    for baobei in order.get('baobei', []):      if not baobei['is_goods']:        continue      org_amount += baobei['price'] * baobei['quantity']      baobei_count += baobei['quantity']  print('{: end_date:    sys.exit('ERROR, 结束日期必须晚于或等于开始日期')  cj_file = './{}.tmp'.format(usr)  cj = cookielib.LWPCookieJar()  try:    cj.load(cj_file)  except:    pass  opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPHandler)  urllib2.install_opener(opener)  login(usr, pwd)  try:    cj.save(cj_file)  except:    pass  orders = parse_bought_list(start_date, end_date)  output(orders, start_date, end_date)  # 输出订单明细  if verbose:    ouput_orders(orders)if __name__ == '__main__':  main()

登录后复制

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至253000106@qq.com举报,一经查实,本站将立刻删除。

发布者:PHP中文网,转载请注明出处:https://www.chuangxiangniao.com/p/2285829.html

(0)
上一篇 2025年2月27日 21:43:14
下一篇 2025年2月19日 14:23:53

AD推荐 黄金广告位招租... 更多推荐

相关推荐

发表回复

登录后才能评论