本篇文章通过实例给大家介绍一下nodejs实现简单网页爬虫功能的方法。有一定的参考价值,有需要的朋友可以参考一下,希望对大家有所帮助。
相关推荐:《nodejs》
网页源码
使用http.get()方法获取网页源码,以hao123网站的头条页面为例
http://tuijian.hao123.com/hotrank
var http = require('http');

// Fetch the hao123 hot-rank page and print its raw HTML source.
http.get('http://tuijian.hao123.com/hotrank', function (res) {
  var data = '';

  // Decode at the stream level so that a multi-byte UTF-8 character split
  // across two chunks is reassembled instead of being corrupted by a
  // per-chunk Buffer-to-string conversion.
  res.setEncoding('utf8');

  // The body arrives in pieces; accumulate them until the response ends.
  res.on('data', function (chunk) {
    data += chunk;
  });

  res.on('end', function () {
    console.log(data);
  });
}).on('error', function (err) {
  // Without a listener, a DNS or connection failure is emitted as an
  // unhandled 'error' event and terminates the process.
  console.error('Request failed: ' + err.message);
});
登录后复制
获得的结果如下所示:
nbsp;html>热点排行榜-头条新闻-hao123新闻导航_hao123上网导航 window.pageId = window.pageId || "hao123-xinwen-tuijian-hotrank"; window.pageVP = window.pageVP || "hao123-xinwen-tuijian-hotrank"; window.HAO=window.HAO||{};window.HAO.https = false;window.HAO.httpsTrans = function(url){return url};window.aid = "nWRkrj61PjnYriYYrHfsrHbsnHb";BigPipe.lazyPagelets = [];BigPipe.loadedResource(["5a7c104a8_7959","d8b3cc9ac_29e3","38645dd_f7dd","8d1d978b0_a316","6cca09af6_f07f","a0832ac19_fb25","25330c25d_ce62","deba0d4c0_c8fe","1c81d5fc6_a695","0c7877e81_8719","6e9548c75_e646","38645dd_0f3e","3f6d691_9321","4d7a174_ccfc","9e71d5b_bed3","b016c1d_d1a3","e073b71_9403","77f7c66_45f3","95a138325_0731"]);BigPipe.hooks["__cb_0_1"]=function(){'use strict'; var $ = require('fe:widget/js/base/jquery.js'); var fixreferrer = require('fe:widget/js/base/fixreferrer.js'); HAO.https && fixreferrer.init($(document)); };BigPipe.hooks["__cb_0_2"]=function(){'use strict';var $ = require('fe:widget/js/base/jquery.js');$('div[data-hook="sitemap"]').on('mouseenter', function (e) {$(this).addClass('sitemap-hover');}).on('mouseleave', function (e) {$(this).removeClass('sitemap-hover');});};BigPipe.hooks["__cb_0_3"]=function(){'use strict';var $ = require('fe:widget/js/base/jquery.js');var Search = require('fe:widget/js/base/search.js');var headerSearchInstance = new Search($('form[data-hook="search-form"]'));};BigPipe.hooks["__cb_0_4"]=function(){'use strict';var $ = require('fe:widget/js/base/jquery.js');var events = require('fe:widget/js/lib/events.js');var login = require('fe:widget/js/base/login.js');var sethome = require('fe:widget/js/base/sethome.js');var $loginCon = $('div[data-hook="c-header-login"]');var $loginDrop = $('div[js-hook="popup-list"]');login.init();events.on('loginSuccess', function(userinfo) {$loginCon.addClass('success');$loginCon.find('.key .word').html(userinfo.userName);/* if ($loginCon.find('.key .word').width() >= 60) {$loginCon.find('.key 
.word').width(50);$loginDrop.outerWidth($loginCon.outerWidth());}*/$('[data-hook=login]').removeAttr('data-hook');});$loginCon.mouseenter(function() {if($(this).hasClass('success')) {$(this).addClass('hover');}}).mouseleave(function() {$(this).removeClass('hover');});$('div[data-hhok="qrcode"]').on('mouseenter', function () {$(this).children('div').show();}).on('mouseleave', function () {$(this).children('div').hide();}).on('click', function (ev) {if ($(this).children('div').length > 0) {return false;}});if($('[data-hook=setHome]').length) {sethome.init();}};BigPipe.hooks["__cb_0_5"]=function(){'use strict';var $ = require('fe:widget/js/base/jquery.js');var popupWidth;$('div[data-hook="nav-more"]').on('mouseenter', function () {popupWidth = $(this).children('div').width();$(this).addClass('nav-more-hover');}).on('mouseleave', function () {$(this).removeClass('nav-more-hover');});};BigPipe.hooks["__cb_0_6"]=function(){'use strict';var $ = require('fe:widget/js/base/jquery.js');var $v2Header = $('#erjiV2Header');var $fixedNav = $('#fixedNav');if ($v2Header.hasClass('v2-fixed') && !($.browser.msie && $.browser.version = offHeight) {if (!$fixedNav.hasClass('nav-v2-fixed')) {$fixedNav.addClass('nav-v2-fixed').find('li.cur').removeClass('cur').addClass('cur');}}else if ($fixedNav.hasClass('nav-v2-fixed')) {$fixedNav.removeClass('nav-v2-fixed').find('li.cur').removeClass('cur').addClass('cur');}});}};BigPipe.hooks["__cb_0_7"]=function(){'use strict';var $ = require('fe:widget/js/base/jquery.js');var Slider = require('fe:widget/js/util/slider.js');new Slider($('.slider'));};BigPipe.hooks["__cb_0_8"]=function(){'use strict';if(typeof BAIDU_SS_HHRUN!='function'){var d=document;(d.getElementsByTagName('head')[0]||d.body).appendChild(d.createElement('script')).src='http://su.bdimg.com/static/dspui/js/ls.js?v='+~(-new Date()/5600e5)}else{BAIDU_SS_HHRUN()}};BigPipe.hooks["__cb_0_9"]=function(){'use strict';var lifttop = 
require('tuijian:widget/lift/lifttop.js');lifttop();};BigPipe.hooks["__cb_0_10"]=function(){'use strict'; window._bd_share_config = { common : { bdText : '', bdDesc : '', bdUrl : '', bdPic : '' }, share : { "bdSize" : 24 }, selectShare : [{ "bdselectMiniList" : ['tsina','weixin','qzone'] }] }; (document.getElementsByTagName('head')[0]||document.body) .appendChild(document.createElement('script')).src='http://bdimg.share.baidu.com/static/api/js/share.js?v=89860593.js?cdnversion='+~(-new Date()); var shareEvent = require('tuijian:widget/index/content/shareEvent.js'); shareEvent(); };BigPipe.hooks["__cb_0_11"]=function(){'use strict';var addBookmark = require('fe:widget/js/base/addbookmark.js');addBookmark.init();};BigPipe.hooks["__cb_0_12"]=function(){'use strict'; (function initTrack(o){ var d = document; var x = d.createElement("script"); x.src = HAO.httpsTrans('http://s0.hao123img.com/res/js/track.js') + '?'+~(new Date/36e5); var a=[]; if(o){ for(var i in o){ a.push(i + ":" + (o[i])) } var config = a.join(";"); x.setAttribute("data-log-config", config); var s = d.getElementsByTagName("script")[0].parentNode; var p= s || d.head; if(p) { setTimeout(function() { p.appendChild(x) }, 0); } } })({ pageId: window.pageId, page: window.pageId, level: 2, vp: window.pageVP || window.pageId, aid: window.aid || '' }); window.js_track_loaded = function (success) { if (success) { window.js_track_loaded = null; if (window.aid) { /* globals Monkey */ Monkey && Monkey.set && Monkey.set('aid', window.aid); } } }; // 跨站资源统计 /* (function (doc) { var s = doc.createElement('script'); s.src = HAO.httpsTrans('http://s0.hao123img.com/res/js/fe/cspalog.js') + '?t=' + (+new Date); var parent = doc.getElementsByTagName('script')[0].parentNode; parent.appendChild(s); })(document); */ };BigPipe.hooks["__cb_0_13"]=function(){'use strict'; require.defer(["fe:widget/js/base/jquery.js","fe:widget/js/base/detect.js","tuijian:widget/index/kuaixun.js"], function ($, detect, kuaixun) { 
$(document).ready(function() { detect(); kuaixun.init(); }); });};BigPipe.setResourceMap({"d8b3cc9ac_29e3":{"src":"http://s1.hao123img.com/resource/fe/pkg/aio-eef856ab5.231bb088c.css","type":"css","deps":[],"mods":["fe:resource/css/base.less"]},"38645dd_f7dd":{"src":"http://s2.hao123img.com/resource/tuijian/css/hotrank.38645dd.css","type":"css","deps":[],"mods":["tuijian:resource/css/hotrank.less"]},"8d1d978b0_a316":{"src":"http://s1.hao123img.com/resource/fe/widget/ui/header/common/v2/header.8d1d978b0.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/v2/header.less"]},"6cca09af6_f07f":{"src":"http://s0.hao123img.com/resource/fe/widget/ui/header/common/v2/logo/logo.6cca09af6.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/v2/logo/logo.less"]},"a0832ac19_fb25":{"src":"http://s1.hao123img.com/resource/fe/widget/ui/header/common/v2/sitemap/sitemap.a0832ac19.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/v2/sitemap/sitemap.less"]},"25330c25d_ce62":{"src":"http://s2.hao123img.com/resource/fe/widget/ui/header/common/v2/adv/adv.25330c25d.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/v2/adv/adv.less"]},"deba0d4c0_c8fe":{"src":"http://s0.hao123img.com/resource/fe/widget/ui/header/common/v2/form/form.deba0d4c0.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/v2/form/form.less"]},"1c81d5fc6_a695":{"src":"http://s0.hao123img.com/resource/fe/widget/ui/header/common/v2/tools/tools.1c81d5fc6.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/v2/tools/tools.less"]},"0c7877e81_8719":{"src":"http://s2.hao123img.com/resource/fe/widget/ui/header/common/v2/nav/nav.0c7877e81.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/v2/nav/nav.less"]},"6e9548c75_e646":{"src":"http://s2.hao123img.com/resource/fe/widget/ui/header/common/v2/tuiguang/tuiguang.6e9548c75.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/v2/tuiguang/tuiguang.less"]},"38645dd_0f3e":{"src":"http://s0.ha
o123img.com/resource/tuijian/widget/index/hotrank/hotrank.38645dd.css","type":"css","deps":[],"mods":["tuijian:widget/index/hotrank/hotrank.less"]},"3f6d691_9321":{"src":"http://s2.hao123img.com/resource/tuijian/widget/index/hotrank/index/slider/slider.3f6d691.css","type":"css","deps":[],"mods":["tuijian:widget/index/hotrank/index/slider/slider.less"]},"4d7a174_ccfc":{"src":"http://s0.hao123img.com/resource/tuijian/widget/index/hotrank/common/slider/slider.4d7a174.css","type":"css","deps":[],"mods":["tuijian:widget/index/hotrank/common/slider/slider.less"]},"9e71d5b_bed3":{"src":"http://s0.hao123img.com/resource/tuijian/widget/index/hotrank/index/news/news.9e71d5b.css","type":"css","deps":[],"mods":["tuijian:widget/index/hotrank/index/news/news.less"]},"b016c1d_d1a3":{"src":"http://s1.hao123img.com/resource/tuijian/widget/index/hotrank/index/fyb/fyb.b016c1d.css","type":"css","deps":[],"mods":["tuijian:widget/index/hotrank/index/fyb/fyb.less"]},"e073b71_9403":{"src":"http://s0.hao123img.com/resource/tuijian/widget/index/hotrank/index/top/top.e073b71.css","type":"css","deps":[],"mods":["tuijian:widget/index/hotrank/index/top/top.less"]},"77f7c66_45f3":{"src":"http://s0.hao123img.com/resource/tuijian/widget/lift/lift.77f7c66.css","type":"css","deps":[],"mods":["tuijian:widget/lift/lift.less"]},"95a138325_0731":{"src":"http://s2.hao123img.com/resource/fe/pkg/aio-8155b5719.3dd99d32e.css","type":"css","deps":[],"mods":["fe:widget/ui/footer/common/footer.less"]},"ed29b1dff_99f2":{"src":"http://s1.hao123img.com/resource/fe/pkg/aio-752ba7752.ed29b1dff.js","type":"js","deps":[],"mods":["fe:widget/js/base/jquery.js"]},"499abaa0e_acda":{"src":"http://s0.hao123img.com/resource/fe/pkg/aio-eef856ab5.499abaa0e.js","type":"js","deps":["ed29b1dff_99f2","15f327f0a_5d72"],"mods":["fe:widget/js/base/browser.js","fe:widget/js/base/fixreferrer.js"]},"15f327f0a_5d72":{"src":"http://s0.hao123img.com/resource/fe/pkg/aio-95cc3013d.15f327f0a.js","type":"js","deps":["ed29b1dff_99f2"],"mods":["f
e:widget/js/base/cookie.js"]},"331938377_b942":{"src":"http://s0.hao123img.com/resource/fe/pkg/aio-1c2d6f9f2.2b182a527.css","type":"css","deps":[],"mods":["fe:widget/ui/header/common/header.less"]},"2009b1512_46d0":{"src":"http://s0.hao123img.com/resource/fe/pkg/aio-1c2d6f9f2.2009b1512.js","type":"js","deps":["ed29b1dff_99f2","15f327f0a_5d72","331938377_b942"],"mods":["fe:widget/js/base/sethome.js","fe:widget/js/lib/events.js","fe:widget/js/base/login.js","fe:widget/js/third/arttemplate/template-native.js","fe:widget/js/base/autocomplete.js","fe:widget/js/base/search.js","fe:widget/ui/header/common/header.js"]},"9a092a7f1_2a6f":{"src":"http://s0.hao123img.com/resource/fe/widget/js/util/slider.9a092a7f1.js","type":"js","deps":["ed29b1dff_99f2"],"mods":["fe:widget/js/util/slider.js"]},"f271c78_c7d7":{"src":"http://s0.hao123img.com/resource/tuijian/widget/lift/lifttop.f271c78.js","type":"js","deps":["ed29b1dff_99f2"],"mods":["tuijian:widget/lift/lifttop.js"]},"4d39d64_93de":{"src":"http://s1.hao123img.com/resource/tuijian/widget/index/content/shareEvent.4d39d64.js","type":"js","deps":["ed29b1dff_99f2"],"mods":["tuijian:widget/index/content/shareEvent.js"]},"3ac67f28c_b365":{"src":"http://s2.hao123img.com/resource/fe/pkg/aio-8155b5719.3ac67f28c.js","type":"js","deps":["ed29b1dff_99f2"],"mods":["fe:widget/js/base/addbookmark.js"]},"67402ee5d_d72b":{"src":"http://s2.hao123img.com/resource/fe/widget/js/base/track.67402ee5d.js","type":"js","deps":["ed29b1dff_99f2"],"mods":["fe:widget/js/base/track.js"]},"f97e9ecfd_31c5":{"src":"http://s1.hao123img.com/resource/fe/widget/js/base/detect.f97e9ecfd.js","type":"js","deps":["67402ee5d_d72b"],"mods":["fe:widget/js/base/detect.js"]},"2e29525_fe44":{"src":"http://s1.hao123img.com/resource/tuijian/widget/index/kuaixun.2e29525.js","type":"js","deps":["ed29b1dff_99f2"],"mods":["tuijian:widget/index/kuaixun.js"]},"5a7c104a8_7959":{"src":"http://s2.hao123img.com/resource/fe/js/lib/main.5a7c104a8.js","type":"js","deps":[],"mods":["fe:reso
urce/js/lib/main.js"]}});BigPipe.onPageletArrive({"id":null,"children":[],"renderMode":"default","parent":null,"deps":{"beforedisplay":["d8b3cc9ac_29e3","38645dd_f7dd","8d1d978b0_a316","6cca09af6_f07f","a0832ac19_fb25","25330c25d_ce62","deba0d4c0_c8fe","1c81d5fc6_a695","0c7877e81_8719","6e9548c75_e646","38645dd_0f3e","3f6d691_9321","4d7a174_ccfc","9e71d5b_bed3","b016c1d_d1a3","e073b71_9403","77f7c66_45f3","95a138325_0731"],"load":["ed29b1dff_99f2","499abaa0e_acda","2009b1512_46d0","9a092a7f1_2a6f","f271c78_c7d7","4d39d64_93de","3ac67f28c_b365"]},"hooks":{"load":["__cb_0_1","__cb_0_2","__cb_0_3","__cb_0_4","__cb_0_5","__cb_0_6","__cb_0_7","__cb_0_8","__cb_0_9","__cb_0_10","__cb_0_11","__cb_0_12","__cb_0_13"]}}); var _trace_page_logid = 2434336151;{di:"u0000",tn:"sitehao123_03",rsi0:"1190",rsi1:"150",type:"metro",version:"201",style:"lichun"}
登录后复制
筛选数据
以网页中的综艺热点部分为例
相关源代码如下
通过分析可知,'综艺'模块与其他模块都位于 class 为 top-wrap 的 div 中,其中,综艺模块的内层元素的 monkey='zy',综艺模块的10条综艺节目的信息都位于
class 为 point-bd 的元素中,综艺节目的名称位于
class 为 point-title 的元素中
cheerio
我们怎么从源代码中获取到有用的数据呢?首先,nodeJS不支持document对象。如果要使用笨办法,只能使用正则表达式来处理
cheerio 是专为服务端(nodejs)定制的、快速灵活的 jQuery 核心实现。它工作于DOM模型上,且解析、操作、渲染都很高效
【安装】
npm install cheerio
【使用】
它的使用方法和jQuery相当类似,上手非常容易。以获取综艺热度前10名的节目名称为例
var http = require('http');
var cheerio = require('cheerio');

// Download the hot-rank page, then hand the complete HTML to filter().
http.get('http://tuijian.hao123.com/hotrank', function (res) {
  var data = '';

  // Decode at the stream level so that a multi-byte UTF-8 character split
  // across two chunks is not corrupted.
  res.setEncoding('utf8');

  res.on('data', function (chunk) {
    data += chunk;
  });

  res.on('end', function () {
    filter(data);
  });
}).on('error', function (err) {
  // Without a listener, a connection failure would crash the process.
  console.error('Request failed: ' + err.message);
});

/**
 * Extract the variety-show titles from the page source and print them.
 *
 * @param {string} data - HTML source of the hot-rank page.
 */
function filter(data) {
  // Titles of the variety shows (the top 10 by search volume, per the
  // original author's note), in page order.
  var result = [];

  // Parse the page source into a jQuery-like `$` object.
  var $ = cheerio.load(data);

  // Title elements inside the block whose `monkey` attribute is "zy".
  var temp_arr = $('[monkey = "zy"]').find('.point-bd').find('.point-title');

  // Collect the text of each title element.
  temp_arr.each(function (index, item) {
    result.push($(item).text());
  });

  // Output reported by the article:
  // [ '变形计','来吧冠军','拜托了冰箱','昆仑决','天生是优我','姐姐好饿','脑力男人时代','奔跑吧兄弟','我想和你唱','玫瑰之旅' ]
  console.log(result);
}
登录后复制
爬虫代码
下面将hao123网页中的'实时热点'、'今日热点'、'民生热点'、'电影'、'电视剧'、'综艺'这6部分的排名爬下来,分别保存到名为'result'的对象的数组中,各数组以对应的榜单名称(如'实时热点')作为属性名
【代码如下】
var http = require('http');
var cheerio = require('cheerio');

// Download the hot-rank page, then hand the complete HTML to filter().
http.get('http://tuijian.hao123.com/hotrank', function (res) {
  var data = '';

  // Decode at the stream level so that a multi-byte UTF-8 character split
  // across two chunks is not corrupted.
  res.setEncoding('utf8');

  res.on('data', function (chunk) {
    data += chunk;
  });

  res.on('end', function () {
    filter(data);
  });
}).on('error', function (err) {
  // Without a listener, a connection failure would crash the process.
  console.error('Request failed: ' + err.message);
});

/**
 * Extract every ranking list from the page source and print the result.
 *
 * The printed object maps each list name (the text of the list's <h2>)
 * to an array of the entry titles found in that list.
 *
 * @param {string} data - HTML source of the hot-rank page.
 */
function filter(data) {
  // Keyed by list name; each value is the array of titles in that list.
  var result = {};

  // Parse the page source into a jQuery-like `$` object.
  var $ = cheerio.load(data);

  // One `.top-wrap` container per ranking list.
  var temp_div = $('.top-wrap');

  temp_div.each(function (index, item) {
    // The list name is the text of the container's <h2>.
    var title = $(item).find('h2').text();

    // Title elements of the entries in this list.
    var temp_arr = $(item).find('.point-bd').find('.point-title');

    // Start this list with an empty array, then fill it in page order.
    var innerResult = result[title] = [];
    temp_arr.each(function (_index, _item) {
      innerResult.push($(_item).text());
    });
  });

  console.log(result);
}
登录后复制
【结果如下】
{ '实时热点': [ '美国逮捕女斯诺登', '成都隐秘母乳买卖', '曝周杰伦青涩旧照', '老头公交强吻女孩', '王传君恋情曝光', '杭州现奇葩窗口', '忘带全班准考证', '未成年持械拍网红', '9秒揍儿子8拳', '戴耳机穿轨道被撞' ], '今日热点': [ '北京回龙观大火', '选美冠军车祸身亡', '2017高考', '成都老火锅店被查', '陈浩民娇妻秀身材', '海边直播发现浮尸', '曝印小天遭妻骗婚', '苹果开发者大会', '6万斤鱼缺氧死亡', '安以轩夏威夷大婚' ], '民生热点': [ '北京回龙观大火', '2017高考', '成都老火锅店被查', '海边直播发现浮尸', '苹果开发者大会', '6万斤鱼缺氧死亡', '北控外援训练猝死', '武汉男子裸体捅人', '多国与卡塔尔断交', '美驻华外交官辞职' ], '电影': [ '神奇女侠', '异星觉醒', '新木乃伊', '中国推销员', '荡寇风云', '异兽来袭', '李雷和韩梅梅', '北极星', '美好的意外', '夏天19岁的肖像' ], '电视剧': [ '龙珠传奇', '楚乔传', '欢乐颂2', '欢乐颂', '职场是个技术活', '择天记', '美食大冒险', '废柴兄弟', '人民的名义', '三生三世十里桃花' ], '综艺': [ '变形计', '来吧冠军', '拜托了冰箱', '昆仑决', '天生是优我', '姐姐好饿', '脑力男人时代', '奔跑吧兄弟', '我想和你唱', '玫瑰之旅' ] }[Finished in 0.7s]
登录后复制
更多编程相关知识,请访问:nodejs!!
以上就是使用nodejs实现一个简单的网页爬虫功能(附代码)的详细内容,更多请关注【创想鸟】其它相关文章!
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件至253000106@qq.com举报,一经查实,本站将立刻删除。
发布者:PHP中文网,转载请注明出处:https://www.chuangxiangniao.com/p/2717728.html