# coding:utf-8
import sys
import os
import time
import requests
from lxml import etree
from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import ElementNotInteractableException
import pymongo
import uuid

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(BASE_DIR))

from incre_web_crawler.constants import (USER_AGENT, RHEL7_URL,
                                         RHEL8_URL, REDHAT_DOMAIN,
                                         RHEL8_STORAGE_DIR, TODAY,
                                         RHEL7_STORAGE_DIR,
                                         LOGIN_URL, USERNAME,
                                         PASSWORD)
from incre_web_crawler.logger import Logger
from incre_web_crawler.utils import time_it, retry, dict2str
from incre_web_crawler.parse_config import get_config, update_config, get_current_time, update_start_crawl_time, \
    update_end_crawl_time, get_rhel_version

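# NOTE: the log path below is Windows-style and relative; the .\log directory
# is assumed to exist already (it is not clear that Logger creates it).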
logger = Logger(log_name=r".\log\RedHatSpider.log", log_level=1, logger="RedHatSpider").get_log()


class IncrementalWebCrawler:
    """Incremental Red Hat crawler."""

    def __init__(self, login_url, username, password):
        chrome_options = webdriver.ChromeOptions()
        # Headless mode
        chrome_options.add_argument('--headless')
        # Add the experimental option below to suppress the "ERROR:browser_switcher_service.cc(238)" message
        chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-infobars')
        chrome_options.add_argument('--incognito')
        chrome_options.add_argument('--blink-settings=imagesEnabled=false')
        chrome_options.add_argument('--disable-extensions')
        chrome_options.add_argument('--start-maximized')
        chrome_options.add_argument('--hide-scrollbars')
        chrome_options.add_argument('user-agent={useragent}'.format(useragent=USER_AGENT))
        self.driver = webdriver.Chrome(options=chrome_options)
        self.rhel7_base_url = RHEL7_URL
        self.rhel8_base_url = RHEL8_URL
        self.login_url = login_url
        self.username = username
        self.password = password
        self.failed_urls = list()
        # Whether this is the first crawl ("true"/"false" string read from the config file)
        self.is_first = get_config()
        # Create the MongoDB client
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.ver_nos = list()
        self.urls = list()

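    # retry (imported from incre_web_crawler.utils) is assumed to re-invoke the
    # decorated method up to reNum times when it raises an exception.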
    @retry(reNum=5)
    def login_red_website(self):
        """Log in to the Red Hat website."""
        try:
            self.driver.get(self.login_url)
        except Exception as e:
            raise e
        logger.info(f"login_title: {self.driver.title}")
        time.sleep(5)
        try:
            # Enter the username and continue
            self.driver.find_element_by_xpath("//div[@class='field']/input[@id='username']").send_keys(self.username)
            self.driver.find_element_by_xpath(
                '//div[@class="centered form-buttons"]/button[@class="centered button heavy-cta"]').click()
            time.sleep(2)

            # Enter the password and submit
            self.driver.find_element_by_xpath("//div[@id='passwordWrapper']/input[@id='password']").send_keys(
                self.password)
            self.driver.find_element_by_xpath("//div[@id='kc-form-buttons']//input[@id='kc-login']").click()
            time.sleep(5)
        except ElementNotInteractableException as e:
            raise e

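    # Returns a list of single-item dicts mapping a version label to a URL
    # suffix (shape inferred from the callers below), roughly
    # [{"8.4": "/downloads/..."}, ...] -- the concrete values are illustrative.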
    # @retry(reNum=5)
    def get_all_rhel_urls(self, url):
        """Collect all download links on the given release page."""
        try:
            self.driver.get(url)
        except Exception as e:
            logger.error(e)
        time.sleep(8)
        target_objs = self.driver.find_elements_by_xpath('//div[@class="option pull-left"][2]/select[@id="evr"]/option')
        version2urls = [{obj.get_attribute("text"): obj.get_attribute("value")} for obj in target_objs]
        if not version2urls:
            return self.get_all_rhel_urls(url)

        return version2urls

    @retry(reNum=5)
    def get_target_page_cookie(self, url):
        """Get the cookies of the target page."""
        try:
            self.driver.get(url)
        except Exception as e:
            logger.error(e)
        rh_jwt = self.driver.get_cookie("rh_jwt")
        session = self.driver.get_cookie("_redhat_downloads_session")
        if all([rh_jwt, session]):
            logger.info(f"jwt: {rh_jwt}")
            logger.info(f"session: {session}")
            rh_jwt_value = rh_jwt["value"]
            session_value = session["value"]
            time.sleep(5)
            cookies = {
                "rh_user": "rd.sangfor|rd|P|",
                "rh_locale": "zh_CN",
                "rh_user_id": "51768269",
                "rh_sso_session": "1",
                "rh_jwt": rh_jwt_value,
                "_redhat_downloads_session": session_value
            }
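            # dict2str (from incre_web_crawler.utils) is assumed to serialize the
            # dict into a "key=value; key=value" Cookie header string, roughly:
            #   "; ".join(f"{k}={v}" for k, v in cookies.items())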
            cookie_str = dict2str(cookies)
            return cookie_str
        else:
            logger.info(f"Failed to get cookies for {url}; it needs to be fetched again")
            self.failed_urls.append(url)

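    # The cookies captured via Selenium are replayed through requests, so the
    # changelog HTML can be fetched directly without driving the browser again.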
    @retry(reNum=5)
    def save_target_data(self, cookie, target_url, filename):
        """Save the changelog data to a file."""
        headers = {
            "User-Agent": USER_AGENT,
            "Cookie": cookie,
        }
        session_obj = requests.Session()
        try:
            response = session_obj.get(target_url, headers=headers)
            wb_data = response.text
            html = etree.HTML(wb_data)
            need_data = html.xpath('//div[@class="changelog"]//text()')
            if need_data:
                logger.info("first element:{element}".format(element=need_data[0]))
                with open(filename, "w", encoding="utf-8", errors="ignore") as fp:
                    for data in need_data:
                        fp.write(data)
        except Exception as e:
            logger.error(e)

    def get_rhel8_latest_data(self):
        """Crawl the latest RHEL 8 changelog."""

        # Log in first
        self.login_red_website()

        version2urls = self.get_all_rhel_urls(self.rhel8_base_url)
        url_suffix = next(iter(version2urls[0].values()))
        url = "".join([REDHAT_DOMAIN, url_suffix])
        ver_no = next(iter(version2urls[0].keys()))
        logger.info("===>>>Start crawling {ver_no}...".format(ver_no=ver_no))
        cookie = self.get_target_page_cookie(url)
        filename = "".join([RHEL8_STORAGE_DIR, str(TODAY), "-", ver_no, ".txt"])
        self.save_target_data(cookie, url, filename)
        logger.info("===>>>Changelog for {ver_no} has been saved".format(ver_no=ver_no))
        self.driver.quit()

    def get_rhel7_latest_data(self):
        """Crawl the latest RHEL 7 changelog."""

        # Log in first
        self.login_red_website()
        version2urls = self.get_all_rhel_urls(self.rhel7_base_url)
        url_suffix = next(iter(version2urls[0].values()))
        url = "".join([REDHAT_DOMAIN, url_suffix])
        ver_no = next(iter(version2urls[0].keys()))
        logger.info("===>>>Start crawling {ver_no}...".format(ver_no=ver_no))
        cookie = self.get_target_page_cookie(url)
        filename = "".join([RHEL7_STORAGE_DIR, str(TODAY), "-", ver_no, ".txt"])
        self.save_target_data(cookie, url, filename)
        logger.info("===>>>Changelog for {ver_no} has been saved".format(ver_no=ver_no))
        self.driver.quit()

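    # Each MongoDB document stores one release as {"ver_no": ..., "url": ...};
    # MongoDB adds the _id field by itself.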
    def save_url_to_mongodb(self, items, rhel_ver):
        """Save the URLs to MongoDB."""
        logger.info(f"pymongo version: {pymongo.version}")
        # Select the database; it is created automatically if it does not exist
        db = self.client.redhat
        if rhel_ver == "rhel8":
            # A collection is MongoDB's counterpart of a table
            collection = db.centos8_table
        else:
            # Updated if it exists, created otherwise
            collection = db.centos7_table

        datas = list()
        for ver_no, url in items:
            # MongoDB creates the id automatically, no need to generate one
            # url_id = uuid.uuid4().__str__()
            data = {
                # "id": url_id,
                "ver_no": ver_no,
                "url": url,
            }
            datas.append(data)
        try:
            collection.insert_many(datas)
        except TypeError as ex:
            logger.error(ex)

    def query_url_by_kw(self, url, ver_no):
        """Query a URL in MongoDB by keyword."""
        # Select the database; it is created automatically if it does not exist
        db = self.client.redhat
        collection = db.centos8_table
        item = {
            "url": url,
            "ver_no": ver_no
        }
        db_data = collection.find_one(item)
        return db_data

    def query_all_db_objs(self, rhel_ver):
        """Query all documents in MongoDB."""
        try:
            db = self.client.redhat
            if rhel_ver == "rhel8":
                collection = db.centos8_table
            else:
                collection = db.centos7_table
            db_objs = collection.find()
        except Exception as ex:
            logger.error(ex)
            return []
        else:
            return db_objs

    def query_all_ver_nos(self, rhel_ver):
        """Query all version numbers."""
        db_objs = self.query_all_db_objs(rhel_ver)
        ver_nos = list()
        for obj in db_objs:
            ver_no = obj["ver_no"]
            ver_nos.append(ver_no)
        return ver_nos

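    # The scraped version numbers and URLs accumulate on self.ver_nos and
    # self.urls as side effects; nothing is returned here.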
    def get_rhel_urls(self, rhel_ver):
        """Collect all URLs for the given RHEL version."""
        # Log in first; the data is then fetched with the session cookies
        self.login_red_website()
        if rhel_ver == "rhel8":
            version2urls = self.get_all_rhel_urls(self.rhel8_base_url)
        else:
            version2urls = self.get_all_rhel_urls(self.rhel7_base_url)

        for item in version2urls:
            url_suffix = next(iter(item.values()))
            ver_no = next(iter(item.keys()))
            url = "".join([REDHAT_DOMAIN, url_suffix])
            self.ver_nos.append(ver_no)
            self.urls.append(url)

        # Storing just the ver_no would actually be enough
        # item = {ver_no: url} would be a better data structure
        # Use recursion like the block below with caution
        # if all([ver_nos, urls]):
        #     return zip(ver_nos, urls)
        # else:
        #     self.get_rhel8_urls()
        logger.info(self.ver_nos)
        # No longer needed; just use self.urls directly below
        # if self.ver_nos and self.urls:
        #     obj = zip(self.ver_nos, self.urls)
        #     return obj
        # else:
        #     import pdb;pdb.set_trace()
        #     return False

    def craw_data(self, items, save_path):
        """Crawl and save the data."""
        for ver_no, url in items:
            # On a non-first crawl, each URL used to be compared against MongoDB:
            # db_data = self.query_url_by_kw(url, ver_no)
            # if not db_data:
            #     # the URL is new, so crawl only this one
            # That handling was clumsy; set-based deduplication via a difference is better
            logger.info("===>>>Start crawling {ver_no}...".format(ver_no=ver_no))
            cookie = self.get_target_page_cookie(url)
            filename = "".join([save_path, str(TODAY), "-", ver_no, ".txt"])
            self.save_target_data(cookie, url, filename)
            logger.info("===>>>Changelog for {ver_no} has been saved".format(ver_no=ver_no))
            time.sleep(5)

        # Note where the driver quits
        self.driver.quit()

    def get_latest_rhel_data(self, items, save_path):
        """Crawl only the latest releases."""
        self.craw_data(items, save_path)

    def get_all_rhel_data(self, rhel_ver):
        """Crawl all data for the given RHEL version."""
        if rhel_ver == "rhel8":
            save_path = RHEL8_STORAGE_DIR
        else:
            save_path = RHEL7_STORAGE_DIR

        self.get_rhel_urls(rhel_ver)
        # items = zip(self.ver_nos, self.urls)
        # Pair up the real url/ver_no values scraped from the page
        objs = [(ver_no, url) for ver_no, url in zip(self.ver_nos, self.urls)]
        ver_nos = list()
        for obj in objs:
            ver_no = obj[0]
            ver_nos.append(ver_no)

        if self.is_first == "false":
            # Known issue: the lookup should query the table matching the given version
            db_ver_nos = self.query_all_ver_nos(rhel_ver)
            logger.info(f"Version numbers on the current page: {ver_nos}")
            logger.info(f"Version numbers already in the database: {db_ver_nos}")
            # Take the set difference
            latest_ver_nos = list(set(ver_nos) - set(db_ver_nos))
            logger.info(f"New version numbers: {latest_ver_nos}")
            latest_items = [obj for obj in objs if obj[0] in latest_ver_nos]
            logger.info(objs)
            logger.info(f"New version numbers and URLs: {latest_items}")
            # Only this difference set is crawled from here on,
            # and the results must also be stored in MongoDB afterwards
            self.get_latest_rhel_data(latest_items, save_path)
            self.save_url_to_mongodb(latest_items, rhel_ver)
            return

        self.craw_data(zip(self.ver_nos, self.urls), save_path)
        # if self.failed_urls:
        #     self.get_lost_data()
        # import pdb
        # pdb.set_trace()
        # After the first crawl, store the URLs in MongoDB
        self.save_url_to_mongodb(zip(self.ver_nos, self.urls), rhel_ver)
        # Simply flipping the flag in memory would not survive the next run of the
        # script, so it must be persisted: 1. in the database, or 2. in the config file
        self.is_first = "false"
        update_config(self.is_first)

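    # Older RHEL 7-only variant of the flow above, kept alongside the
    # version-agnostic get_all_rhel_data path.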
    def get_all_rhel7_data(self):
        """Crawl all RHEL 7 data."""

        # Log in first
        self.login_red_website()
        version2urls = self.get_all_rhel_urls(self.rhel7_base_url)

        for item in version2urls:
            url_suffix = next(iter(item.values()))
            url = "".join([REDHAT_DOMAIN, url_suffix])
            ver_no = next(iter(item.keys()))
            logger.info("===>>>Start crawling {ver_no}...".format(ver_no=ver_no))
            cookie = self.get_target_page_cookie(url)
            filename = "".join([RHEL7_STORAGE_DIR, str(TODAY), "-", ver_no, ".txt"])
            self.save_target_data(cookie, url, filename)
            logger.info("===>>>Changelog for {ver_no} has been saved".format(ver_no=ver_no))
            time.sleep(5)
        self.driver.quit()
        logger.info("============Start crawling the failed URLs==========================")
        if self.failed_urls:
            self.get_lost_data()

    def get_lost_data(self):
        """Re-crawl the data that failed previously."""
        if self.failed_urls:
            self.login_red_website()
            for url in self.failed_urls:
                ver_no = url.split("/")[-4]
                logger.info("===>>>Start crawling {ver_no}...".format(ver_no=ver_no))
                cookie = self.get_target_page_cookie(url)
                filename = "".join([RHEL7_STORAGE_DIR, str(TODAY), "-", ver_no, ".txt"])
                self.save_target_data(cookie, url, filename)
                logger.info("===>>>Changelog for {ver_no} has been saved".format(ver_no=ver_no))
                time.sleep(5)
            self.driver.quit()


@time_it
def main():
    red_spider = IncrementalWebCrawler(LOGIN_URL, USERNAME, PASSWORD)
    rhel_ver = get_rhel_version()
    red_spider.get_all_rhel_data(rhel_ver)
    # red_spider.get_lost_data()


if __name__ == '__main__':
    start_time = get_current_time()
    update_start_crawl_time(start_time)
    main()
    end_time = get_current_time()
    update_end_crawl_time(end_time)