Article from: https://www.cnblogs.com/Python1234/p/9063353.html

First, register an account. You can skip the enterprise verification: zhaopingou ("recruitment dog") is aimed at corporate HR, so the site normally asks for enterprise verification, but here we simply skip that step. The implementation follows, with detailed comments:

import json
import os
import random
import re
import sys
import traceback
import time
from PIL import Image
from lxml import html as lxml_html
import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
import requests
import base64
from requests.exceptions import ConnectionError
import http.cookiejar
import logging
from dama2_API import Dama2API
# Third-party library that returns random User-Agent strings
from fake_useragent import UserAgent

ua = UserAgent()

class RTC_zhaopingou(object):

    def __init__(self, account: dict, debug=False, visible=-1, last_try=False):
        assert account['user_id']
        assert account['password']
        self.account = account  # used later by login() and _login_process_captcha()
        logging.info('Change webdriver to FireFox')
        # Create a session object; it is used when crawling the list page and the detail page
        self.session = requests.Session()
        self.session.headers = {
            'Host': "qiye.zhaopingou.com",
            "Origin": "http://qiye.zhaopingou.com",
            "Referer": "http://qiye.zhaopingou.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        }
        # You need a Dama2 ("code rabbit") account; download the client code from the Dama2 platform
        self.dama2 = Dama2API()

    def login(self):
        l = logging
        l.info("Processing Login...")
        self.driver = webdriver.Firefox()
        self.driver.set_window_size(1920, 1080)
        self.driver.implicitly_wait(10)
        driver = self.driver
        # login_url = 'http://qiye.zhaopingou.com/zhaopingou_interface/security_login?timestamp=' + str(int(time.time() * 1000))
        login_url = 'http://qiye.zhaopingou.com/'
        driver.get(login_url)
        # After the page opens you have to choose a city first
        driver.find_element_by_xpath('//div[@class="city-now citys"]').click()
        # Find the username and password inputs and type character by character, imitating a human
        # (the placeholder text in the XPaths must match the site's actual placeholders)
        for i in self.account['username']:
            driver.find_element_by_xpath('//input[@placeholder="Please enter cell phone number / mailbox / dog number"]').send_keys(i)
            time.sleep(random.uniform(0.2, 0.8))
        for j in self.account['password']:
            driver.find_element_by_xpath('//input[@placeholder="Please enter the password"]').send_keys(j)
            time.sleep(random.uniform(0.2, 0.8))
        # Getting the button that pops up the CAPTCHA has a pitfall: it cannot be located directly,
        # because it sits inside an iframe. First switch into the first iframe via
        # driver.find_element_by_tag_name("iframe"), then locate the button by XPath.
        # iframe = driver.find_element_by_id('captcha_widget_aiwaylekc')
        driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
        # driver.switch_to.frame('captcha_widget_aiwaylekc')
        driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').click()
        # Wait 5 seconds to avoid the widget not being loaded yet, then switch from the iframe
        # back to the main HTML page with driver.switch_to.default_content()
        time.sleep(5)
        driver.switch_to.default_content()
        # After clicking the button a new iframe appears; there are now two iframes side by side,
        # so switch from the main page into the second one
        driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
        # CAPTCHA area
        captcha_xpath = '//div[@class="lc-panel"]'
        # captcha_xpath = '#l-captcha-float_aiwaylekc'
        captcha_ok = self._login_process_captcha(captcha_xpath)
        # CAPTCHA solved, submit the login form
        if captcha_ok:
            driver.switch_to.default_content()
            driver.find_element_by_id('form_login').click()
            time.sleep(3)
            current_url = driver.current_url
            # Check whether the URL after logging in is the expected one
            expect_url = 'http://qiye.zhaopingou.com/'
            if current_url == expect_url:
                l.info('login success!!!')
                # Fetch the cookies and store them in the session for the list-page and detail-page crawlers
                cookie = dict()
                print(driver.get_cookies())
                for item in driver.get_cookies():
                    # cookie += "; {}={}".format(item['name'], item["value"])
                    cookie[item['name']] = item['value']
                    if item['name'] == 'hrkeepToken':
                        self.token = item['value']
                # Store the cookies
                self.session.cookies = requests.utils.cookiejar_from_dict(cookie)
                l.info("get cookie: {}".format(cookie))
                # Login succeeded; quit the driver, it is no longer needed
                self.driver.quit()
                return True
        else:
            l.info('login failed due to CAPTCHA, submit_count')
            return False

    def _login_process_captcha(self, captcha_xpath):
        l = logging
        driver = self.driver
        captcha_element = driver.find_element_by_xpath(captcha_xpath)
        # Coordinates and size of the CAPTCHA widget
        offset = captcha_element.location
        print('offset:', offset)
        size = captcha_element.size
        # CAPTCHA-decoding client
        dama2 = self.dama2
        # Directory where the CAPTCHA screenshot is saved
        shm_dir = r'/tmp/zhaopingou/'
        if not os.path.exists(shm_dir):
            os.makedirs(shm_dir)
        captcha_img_path = os.path.join(shm_dir, 'captcha_img_{user_id}.png'.format(user_id=self.account['user_id']))
        maximum = 20
        attempt = 0
        while attempt <= maximum:
            l.info(f'Trying to decode CAPTCHA: {attempt}/{maximum}')
            # CAPTCHA element
            captcha_element = driver.find_element_by_xpath(captcha_xpath)
            # Screenshot the CAPTCHA into captcha_img_path
            captcha_element.screenshot(captcha_img_path)
            try:
                # Call the Dama2 interface with the CAPTCHA type and the image file;
                # it returns the coordinates to click in coordinate_list
                captcha_id, coordinate_list = dama2.decode_captcha(captcha_type=6137, file_path=captcha_img_path)
                l.info(f'coordinate_list:{coordinate_list}')
            except Exception as err:
                err_str = str(err)
                tb = traceback.format_exc()
                msg = f'Exception occurred when decode CAPTCHA, err: {err_str}, tb:\n{tb}'
                l.warning(msg)
                attempt += 1
                # On an exception, go back and retry
                continue
            # Move the mouse to each returned coordinate and click
            for xy in coordinate_list:
                action = ActionChains(driver)
                action.move_to_element_with_offset(captcha_element, xy[0], xy[1]).click()
                action.perform()
                time.sleep(random.uniform(0.5, 2))
            # Switch back to the main HTML first, then into the first iframe, and read the text of the
            # popup button to judge whether verification succeeded
            # (the text compared below must match the site's actual success message, which is in Chinese)
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[0])
            text = driver.find_element_by_xpath('//span[@class="captcha-widget-text"]').text
            if text.find('Verify success') != -1:
                l.info('Verification code verification success!')
                time.sleep(random.uniform(1, 2))
                return True
            else:
                # Failed: switch back to the second iframe and solve the newly generated CAPTCHA
                driver.switch_to.default_content()
                driver.switch_to.frame(driver.find_elements_by_tag_name("iframe")[1])
                l.info('fail, and try it again')
                attempt += 1
                time.sleep(2)
                continue
        return False

    # Get the list page by searching a keyword, positioned at the given page
    def search(self, keyword, page_to_go):
        '''Search resumes and return the list page; the data comes back as JSON.'''
        l = logging
        assert keyword
        self.keyword = keyword
        # POST parameters captured with the Firefox developer tools
        params = {
            "pageSize": page_to_go,
            "pageNo": "25",
            "keyStr": keyword,
            "companyName": "",
            "schoolName": "",
            "keyStrPostion": "",
            "postionStr": "",
            "startDegrees": "-1",
            "endDegress": "-1",
            "startAge": "0",
            "endAge": "0",
            "gender": "-1",
            "region": "",
            "timeType": "-1",
            "startWorkYear": "-1",
            "endWorkYear": "-1",
            "beginTime": "",
            "endTime": "",
            "isMember": "-1",
            "hopeAdressStr": "",
            "cityId": "-1",
            "updateTime": "",
            "tradeId": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2"
        }
        retry = 0
        while True:
            # The real request URL, captured from the network traffic, ends with a millisecond timestamp
            search_url = "http://qiye.zhaopingou.com/zhaopingou_interface/find_warehouse_by_position_new?timestamp=" + str(int(time.time() * 1000))
            l.info('search_url:{}'.format(search_url))
            self.current_url = search_url
            l.debug(f'Open search page. url,params,keyword,userToken: {search_url},{params},{keyword},{self.token}')
            retry += 1
            if retry == 11:
                return ''
            try:
                # Request through the session so the login cookies are sent
                res = self.session.post(search_url, data=params)
            except ConnectionError:
                l.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                self.current_url = search_url
                continue
            else:
                l.info('current url is:{}'.format(res.url))
                if res.url != search_url:
                    login_result = self.login()
                    if login_result:
                        continue
                    else:
                        l.warning("Login failed!")
                        sys.exit('login failed')
                elif not res.text:
                    l.info("Service is busy. Wait 5 minutes and retry...")
                    time.sleep(300)
                    l.info('Continue Searching...')
                    continue
                # The returned data is abnormal: very little content
                elif len(str(res.text)) < 2000:
                    # If the response says "please log in and review the resume", log in again and keep crawling
                    # (the text compared below must match the site's actual error message, which is in Chinese)
                    if 'Please log in and review the resume' in str(res.text):
                        self.login()
                        continue
                    result = str(res.text)
                    # Rotate the User-Agent
                    self.session.headers['User-Agent'] = ua.firefox
                    l.info(f'errorcode msg:{result}')
                    l.info('Too frequent operation, please try again in a minute')
                    time.sleep(random.randint(61, 100))
                    continue
                else:
                    try:
                        # Normal data: parse the JSON with json.loads() and re-serialise it with json.dumps()
                        resume_list = json.loads(res.text)
                        resume_list["current_page"] = page_to_go
                        # Add the current page number to the list page
                        res = json.dumps(resume_list, ensure_ascii=False)
                        l.info(f'search_resume_list_info:{res}')
                        return res
                    except:
                        l.warning(res.text)
                        l.warning("something wrong! sleep 5 minutes and retry...")
                        time.sleep(300)
                        continue

    def open_resume(self, url):
        '''
        Open a resume and get the detail page.
        The URL is built from the Base64-encrypted user id.
        '''
        l = logging
        l.debug(f'Open a resume: request_url: {url}')
        resumeHtmlId = (url.split("="))[1]
        # Preset Referer
        # self.session.headers['Referer'] = "http://qiye.zhaopingou.com/resume?key=" + self.keyword
        # POST parameters of the resume detail page, captured from the network traffic
        open_resume_data = {
            "resumeHtmlId": resumeHtmlId,
            "keyStr": "",
            "keyPositionName": "",
            "tradeId": "",
            "postionStr": "",
            "jobId": "0",
            "companyName": "",
            "schoolName": "",
            "clientNo": "",
            "userToken": self.token,
            "clientType": "2"
        }
        retry = 0
        while True:
            # The real detail-page URL, captured from the network traffic
            openresumeurl = "http://qiye.zhaopingou.com/zhaopingou_interface/zpg_find_resume_html_details?timestamp=" + str(int(time.time() * 1000))
            l.info('resume_url:{}'.format(openresumeurl))
            retry += 1
            if retry == 11:
                return ''
            try:
                res = self.session.post(url=openresumeurl, data=open_resume_data)
            except ConnectionError:
                l.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue
            else:
                # The returned HTML page
                l.info('current url is:{}'.format(res.url))
                if res.url != openresumeurl:
                    l.info("cookie is invalid. Login with webdriver")
                    login_result = self.login()
                    if login_result:
                        continue
                    else:
                        l.warning("Login failed!")
                        sys.exit('login failed')
                if not res.text:
                    l.info("Service is busy. Wait 5 minutes and retry...")
                    time.sleep(300)
                    continue
                elif len(str(res.text)) < 2000:
                    print('errorcode:', res.text)
                    result = str(res.text)
                    l.info(f'errorcode msg:{result}')
                    l.info('Too frequent operation, please try again in a minute')
                    time.sleep(random.randint(61, 100))
                    continue
                else:
                    try:
                        page_len = len(res.text)
                        self.current_url = openresumeurl
                        l.info(f'Downloaded a resume, len: {page_len:,d}, current_url: {url}')
                        resp_json = json.loads(res.text)
                        res_utf = json.dumps(resp_json, ensure_ascii=False)
                        return res_utf
                    except:
                        l.warning(res.text)
                        l.warning("something wrong! sleep 5 minutes and retry...")
                        time.sleep(300)
                        continue

if __name__ == '__main__':
    # The account and password here are fake; fill in your own
    rtc_zhaopingou = RTC_zhaopingou(account={'user_id': '-701', 'username': '13419696888', 'password': '123'},
                                    debug=False,
                                    visible=1, last_try=False)
    rtc_zhaopingou.login()
    keyword_list = ['python', 'big data', 'artificial intelligence', 'java']
    for kw in keyword_list:
        for i in range(1, 200):
            search_result = rtc_zhaopingou.search(kw, i)
            print('****************************************************************')
    res = rtc_zhaopingou.open_resume('http://qiye.zhaopingou.com/resume/detail?resumeId=5761920')
    print(res)
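The demo above discards the JSON string returned by search(). For reference, here is a minimal sketch of how the list page and the detail page could be chained together. The field names resumeHtmlList and resumeHtmlId are assumptions for illustration only; inspect an actual search() response to get the real keys.

import json

# Minimal sketch: chain search() and open_resume(); client is a logged-in RTC_zhaopingou instance
def crawl_keyword(client, keyword, pages=5):
    for page in range(1, pages + 1):
        raw = client.search(keyword, page)
        if not raw:
            continue
        data = json.loads(raw)
        # 'resumeHtmlList' / 'resumeHtmlId' are hypothetical keys used for illustration
        for item in data.get('resumeHtmlList', []):
            resume_id = item.get('resumeHtmlId')
            if resume_id:
                detail_url = 'http://qiye.zhaopingou.com/resume/detail?resumeId={}'.format(resume_id)
                print(client.open_resume(detail_url))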

The Dama2 ("code rabbit") client code can be downloaded from the platform yourself; place it in the same directory as this script so that the dama2_API import works.
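If you just want the rest of the script to run before wiring in the real client, note that the listing only assumes one method on that object: decode_captcha(captcha_type, file_path), returning a (captcha_id, coordinate_list) pair where coordinate_list holds the (x, y) positions to click. A placeholder with that shape (it does not actually solve anything and is only a stand-in until the real Dama2 client is dropped into the directory) could look like this:

# dama2_API.py -- placeholder with the interface the crawler expects.
# It does NOT solve CAPTCHAs; replace it with the real client downloaded from the Dama2 platform.
class Dama2API(object):
    def decode_captcha(self, captcha_type, file_path):
        # The real client uploads the image at file_path for the given captcha_type
        # and returns (captcha_id, [(x1, y1), (x2, y2), ...]).
        raise NotImplementedError('Download the real Dama2 client and place it in this directory')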

I hope this helps!

Welcome to my blog: https://home.cnblogs.com/u/Python1234/

You are also welcome to join our Q&A exchange group (thousands of members):

