Article from: https://segmentfault.com/q/1010000011700715
Question:
#-*-coding:utf-8-*-
import requests

def load_url(url, file_name):
    try:
        my_headers = {
            'Agent-User': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/5.0.354.0 Safari/533.3'}
        re = requests.get(url, headers=my_headers)
        re.raise_for_status()
        re.encoding = re.apparent_encoding
        print('Crawling %s complete' % file_name)
        return re.text
    except:
        print('Crawl failed!')

def save_data(data, file_name):
    print('Saving file %s' % file_name)
    with open(file_name, 'w') as f:
        f.write(data)
    print('File %s saved!' % file_name)

def spider(kw, begin, end):
    for page in range(begin, end + 1):
        pn = (begin - 1) * 50
        kw = {'kw': kw}
        full_url = 'http://tieba.baidu.com/f?' + 'kw=' + kw['kw'] + '&ie=utf-8&pn=' + str(pn)
        print(full_url)
        file_name = 'web page ' + str(page) + '.html'
        html = load_url(full_url, file_name)
        save_data(html, file_name)

if __name__ == '__main__':
    # url = 'http://tieba.baidu.com/f?'
    kw = input('Please enter the tieba name to crawl: ')
    begin = int(input('Please enter the start page: '))
    end = int(input('Please enter the end page: '))
    spider(kw, begin, end)

Error message:

F:\Python\python.exe F:/Python/exercise clip/Spider/tiebaCase.py
Please enter the tieba name to crawl: Wolf 2
Please enter the start page: 1
Please enter the end page: 2
http://tieba.baidu.com/f?kw=Wolf 2&ie=utf-8&pn=0
Crawling web page 1.html complete
Saving file web page 1.html
Traceback (most recent call last):
  File "F:/Python/exercise clip/Spider/tiebaCase.py", line 37, in <module>
    spider(kw, begin, end)
  File "F:/Python/exercise clip/Spider/tiebaCase.py", line 30, in spider
    save_data(html, file_name)
  File "F:/Python/exercise clip/Spider/tiebaCase.py", line 19, in save_data
    f.write(data)
UnicodeEncodeError: 'gbk' codec can't encode character '\xe7' in position 265: illegal multibyte sequence

How can this error be solved?
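For context, the traceback points at f.write(data): open(file_name, 'w') was given no encoding, so on this Windows machine it falls back to the locale codec (gbk), and any character in the downloaded page that GBK cannot represent aborts the write. A minimal sketch that reproduces the same exception (assuming a system whose locale encoding is gbk; the file name here is made up):

# Reproduction sketch: open() with no explicit encoding uses the locale codec,
# which is gbk on a Chinese-locale Windows install.
page_text = 'downloaded text with a non-GBK character: \xe7'
with open('demo.html', 'w') as f:   # implicitly encoding='gbk' on that system
    f.write(page_text)              # UnicodeEncodeError: 'gbk' codec can't encode character '\xe7' ...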

Answer 0:
# coding: utf-8

from __future__ import unicode_literals

import requests
import codecs

def load_url(url, file_name):
    try:
        my_headers = {
            'Agent-User': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/5.0.354.0 Safari/533.3'}
        re = requests.get(url, headers=my_headers)
        print('Crawl %s content completed' % file_name)
        return re.text
    except:
        print('Crawl failed!')

def save_data(data, file_name):
    print('Saving file %s' % file_name)
    print(data)
    with codecs.open(file_name, 'w', 'utf-8') as f:
        f.write(data)
    print('File %s saved!' % file_name)

def spider(kw, begin, end):
    for page in range(begin, end + 1):
        pn = (begin - 1) * 50
        kw = {'kw': kw}
        full_url = 'http://tieba.baidu.com/f?' + 'kw=' + kw['kw'] + '&ie=utf-8&pn=' + str(pn)
        print(full_url)
        file_name = 'web page ' + str(page) + '.html'
        html = load_url(full_url, file_name)
        save_data(html, file_name)

if __name__ == '__main__':
    spider('Wolf 2', 1, 2)
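The key change is codecs.open(file_name, 'w', 'utf-8'), which writes the page out as UTF-8 instead of letting open() fall back to the Windows locale codec (gbk), which cannot encode every character in the page. On Python 3 the codecs module is not needed, because the built-in open() accepts an encoding parameter directly. A minimal sketch of save_data written that way (same names as above, shown only as an illustration):

def save_data(data, file_name):
    print('Saving file %s' % file_name)
    # Force UTF-8 so the write never depends on the Windows locale codec (gbk).
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(data)
    print('File %s saved!' % file_name)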
