# Fetches the links from the Google Search Console report.
# Parses the backlink list, looks for the link to our site on each page and grabs its anchor.
# Output: a report with the anchor-text distribution.
import csv
import requests
import argparse
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor


# Start by collecting the URLs from the CSV file
def csvtolist(gsc_csv_file):
    with open(gsc_csv_file) as f:
        f_csv = csv.reader(f)
        next(f_csv)  # skip the header row
        links = [line[0] for line in f_csv]  # list of backlinks
    return links


# Nofollow check
def isNofollow(link):
    return 'nofollow' in str(link).lower()


# Internal-link check: same host as the page being crawled
def is_internal(url, start_url):
    u = urlparse(url)
    s = urlparse(start_url)
    return u.netloc == s.netloc


# Write the results to an output CSV
def out_csv(url_property_list):
    with open('out.csv', 'w', newline='') as f:
        f_writer = csv.writer(f)
        header = 'domain,link,anchor,is_no_follow,internal_outlinks,external_outlinks'
        f_writer.writerow(header.split(','))
        for url_property in url_property_list:
            f_writer.writerow(url_property)


# The crawl logic, turned into a class
class myGscCrawler(object):
    def __init__(self, linklist, domain):
        self.linklist = linklist
        self.domain = domain
        self.count_timeout = 0
        self.count_connect_error = 0
        self.result = []
        self.Url_property = namedtuple(
            'Url_property',
            'domain, link, anchor, is_no_follow, internal_outlinks, external_outlinks')

    def check_link(self, url):
        # Fetch and inspect a single backlink URL
        try:
            print('URL to crawl:', url)
            r = requests.get(url, verify=False, timeout=10)
        except requests.exceptions.Timeout:
            print('Timeout issue')
            self.count_timeout += 1
            return
        except requests.exceptions.ConnectionError:
            print('Connection error')
            self.count_connect_error += 1
            return

        if r.status_code != 200:
            # Skip dead pages
            return

        soup = BeautifulSoup(r.text, 'lxml')
        if soup.body is None:
            return
        ndd = urlparse(url).netloc
        internalLinks = 0
        externalLinks = 0

        list_links_ok = []
        for l in soup.body.find_all('a'):
            if not l.has_attr('href'):
                continue
            u = urljoin(url, l['href'])
            if is_internal(u, url):
                internalLinks += 1
            else:
                externalLinks += 1
            list_links_ok.append(l)

        for l in list_links_ok:
            u = urljoin(url, l['href'])
            u_parse = urlparse(u)
            if self.domain in u_parse.netloc:
                print(url, l['href'])
                self.result.append(self.Url_property(
                    ndd, url, l.string, isNofollow(l), internalLinks, externalLinks))

    def check_all(self):
        # Run the checks through a thread pool; the with-block waits for every job
        with ThreadPoolExecutor(128) as executor:
            jobs = [executor.submit(self.check_link, url) for url in self.linklist]
        print('Timeouts:', self.count_timeout, 'Connection errors:', self.count_connect_error)
        return self.result


if __name__ == "__main__":
    # A small crawler built with requests and bs4 for the links reported in GSC
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--domain', required=True, help="The domain being analysed")
    parser.add_argument('-l', '--list', required=True, help="The CSV file exported from GSC")
    args = parser.parse_args()

    domain = args.domain
    links = csvtolist(args.list)
    test = myGscCrawler(links, domain)
    gsclinks = test.check_all()
    out_csv(gsclinks)
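
# ---------------------------------------------------------------------------
# Illustrative appendix (not part of the original script). A minimal sketch of
# the anchor-distribution report announced in the header comment: it tallies
# the anchor texts collected by the crawler with collections.Counter. The
# function name anchor_distribution and the anchors.csv file name are
# assumptions made for this example.
# ---------------------------------------------------------------------------
from collections import Counter


def anchor_distribution(url_property_list, out_file='anchors.csv'):
    # Count each anchor text; links without text (e.g. image links) are grouped
    # under '(no text)'
    counts = Counter((p.anchor or '(no text)').strip() for p in url_property_list)
    with open(out_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['anchor', 'count'])
        for anchor, count in counts.most_common():
            writer.writerow([anchor, count])

# Usage sketch (file and script names are examples): export the GSC links
# report as links.csv with the linking pages in the first column, then run
#
#   python gsc_crawler.py -d example.com -l links.csv
#
# out.csv lists one row per backlink found (domain, link, anchor, is_no_follow,
# internal_outlinks, external_outlinks); anchor_distribution(gsclinks) would
# then write the anchor counts to anchors.csv.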