In questo intervento ho trattato l'utilizzo di strumenti e API per automatizzare attività quotidiane di analisi e monitoraggio.
View Slide
▪▪▪
My First Headinglink1link2testo/html/body/a[1]/@href
/axis::node-test[predicate]/axis::node-test[predicate]/axis::node-test[predicate]▪→→→/locationstep/locationstep/locationstep
/html/body/a[1]/@href/child::html/child::body/child::a[1]/attribute::href
▪esempio di contenuto▪esempio di contenuto▪//nome-tag/@attributo
▪→→▪→ contains(str1, str2)→ starts-with(str1, str2)/html/body/a[1]/@href//a[1]/@href//a[contains(@href, "link1.html")]//a[starts-with(@href, "link1")]
▪//*[@id="menu-item-5015"]/a▪//ul[@id="menu-primary-items"]/li/a
=IMPORTXML(url, xpath_query)▪▪
=XPathOnUrl(url, xpath, attribute, xmlHTTPSettings, mode)▪▪▪
▪▪
"""Scrape the primary-menu link texts from every URL listed in urls.txt
and append one "url,texts" line per page to results.txt."""
from lxml import html
import requests

# Context managers guarantee both files are closed even if a request fails
# (the original opened them manually and never closed urls.txt).
with open("urls.txt", "r") as urls, open("results.txt", "a+") as results_file:
    for item in urls:
        url = item.rstrip("\n")
        page = requests.get(url)
        tree = html.fromstring(page.content)
        # All anchor texts of the primary navigation menu.
        text = tree.xpath('//ul[@id="menu-primary-items"]/li/a/text()')
        results_file.write("%s,%s\n" % (url, text))
        print ("SCRAPING " + url)
        print (text, "\n")
//title//meta[@name="description"]/@content//link[@hreflang="it-IT"]/@href//link[contains(@hreflang, *)]/@href//link[@rel="canonical"]/@href//meta[@name="robots"]/@content//h1//url/loc/text()
▪→→→▪
https://www.googleapis.com/analytics/v3/data/ga?ids=ga:XXXX&start-date=2019-01-01&end-date=2019-03-31&metrics=ga:sessions&filters=ga:country==Italy&access_token=XXXXhttps://www.googleapis.com/analytics/v3/data/ga?ids=ga:XXXX&start-date=2019-01-01&end-date=2019-03-31&metrics=ga:sessions&filters=ga:country==Italy&access_token=XXXX▪▪▪
https://www.googleapis.com/analytics/v3/data/ga?ids=ga:XXXX&start-date=2019-01-01&end-date=2019-03-31&metrics=ga:sessions&filters=ga:country==Italy
https://www.googleapis.com/analytics/v3/data/ga?ids=ga:XXXX&start-date=2019-01-01&end-date=2019-03-31&metrics=ga:sessions&filters=ga:country==Italy▪▪▪
https://www.googleapis.com/analytics/v3/data/ga?ids=ga:XXXX&start-date=2019-01-01&end-date=2019-03-31&metrics=ga:sessions&filters=ga:country==Italy▪▪▪▪
▪▪▪ga:name operator expressionga:country == Italy
▪▪→→→→
▪▪https://www.googleapis.com/webmasters/v3/sites/XXXX/searchAnalytics/query{"startDate": "2019-01-01","endDate": "2019-03-31","dimensions": ["query"],"dimensionFilterGroups": [{"filters": [{"dimension": "country","operator": "equals","expression": "ITA"}]}],"aggregationType": "auto","rowLimit": 25000,"startRow": 0}
https://www.googleapis.com/webmasters/v3/sites/XXXX/searchAnalytics/query{"startDate": "2019-01-01","endDate": "2019-03-31","dimensions": ["query"],"dimensionFilterGroups": [{"filters": [{"dimension": "country","operator": "equals","expression": "ITA"}]}],"aggregationType": "auto","rowLimit": 25000,"startRow": 0}
{"startDate": "2019-01-01","endDate": "2019-03-31","dimensions": ["query"],"dimensionFilterGroups": [{"filters": [{"dimension": "country","operator": "equals","expression": "ITA"}]}],"aggregationType": "auto","rowLimit": 25000,"startRow": 0}▪▪▪https://www.googleapis.com/webmasters/v3/sites/XXXX/searchAnalytics/query
{"startDate": "2019-01-01","endDate": "2019-03-31","dimensions": ["query"],"dimensionFilterGroups": [{"filters": [{"dimension": "country","operator": "equals","expression": "ITA"}]}],"aggregationType": "auto","rowLimit": 25000,"startRow": 0}▪▪https://www.googleapis.com/webmasters/v3/sites/XXXX/searchAnalytics/query
▪▪▪"dimension": string, "operator": string, "expression": string"dimension": country, "operator": equals, "expression": ITA
{"startDate": "2019-01-01","endDate": "2019-03-31","dimensions": ["query"],"dimensionFilterGroups": [{"filters": [{"dimension": "country","operator": "equals","expression": "ITA"}]}],"aggregationType": "auto","rowLimit": 25000,"startRow": 0}▪▪→→https://www.googleapis.com/webmasters/v3/sites/XXXX/searchAnalytics/query
...rowLimit = 25000retrieve_search_queries = webmasters_service.searchanalytics().query(siteUrl='ENTER-YOURS-HERE',body={"startDate": "2019-01-01","endDate": "2019-03-31","dimensions": ["query"],"dimensionFilterGroups": [{"filters": [{"dimension": "country","operator": "equals","expression": "ITA"}]}],"aggregationType": "auto","rowLimit": rowLimit}).execute()results_file = open("results.txt", "a+")for i in range(0, rowLimit):keys = retrieve_search_queries['rows'][i]['keys']impressions = retrieve_search_queries['rows'][i]['impressions']clicks = retrieve_search_queries['rows'][i]['clicks']ctr = retrieve_search_queries['rows'][i]['ctr']position = retrieve_search_queries['rows'][i]['position']print ("%s|%s|%s|%s|%s\n" % (keys, impressions, clicks, ctr, position))results_file.write ("%s|%s|%s|%s|%s\n" % (keys, impressions, clicks, ctr, position))results_file.close()
▪▪▪https://adwords.google.com/api/adwords/cm/v201809/CampaignServicexmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/"xmlns:xsd="http://www.w3.org/2001/XMLSchema"xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">......
▪▪▪→→
▪▪▪→→→−−→
▪▪→→
▪▪→→→
▪▪▪▪▪→→→
...def main(client, item, ad_group_id=None):# Initialize appropriate service.targeting_idea_service = client.GetService('TargetingIdeaService', version='v201809')# Construct selector object and retrieve related keywords.selector = {'ideaType': 'KEYWORD','requestType': 'STATS'}selector['requestedAttributeTypes'] = ['KEYWORD_TEXT', 'SEARCH_VOLUME']offset = 0selector['paging'] = {'startIndex': str(offset),'numberResults': str(PAGE_SIZE)}selector['searchParameters'] = [{'xsi_type': 'RelatedToQuerySearchParameter','queries': item}]# Language setting (optional).selector['searchParameters'].append({# The ID can be found in the documentation:# https://developers.google.com/adwords/api/docs/appendix/languagecodes'xsi_type': 'LanguageSearchParameter','languages': [{'id': '1004'}]})# Location setting (optional).selector['searchParameters'].append({# The ID can be found in the documentation:# https://developers.google.com/adwords/api/docs/appendix/geotargeting'xsi_type': 'LocationSearchParameter','locations': [{'id': '2380'}]})# Network search parameter (optional)selector['searchParameters'].append({'xsi_type': 'NetworkSearchParameter','networkSetting': {'targetGoogleSearch': True,'targetSearchNetwork': False,'targetContentNetwork': False,'targetPartnerSearchNetwork': False}})▪▪▪▪▪▪▪
...# Display results.if 'entries' in page:for result in page['entries']:attributes = {}for attribute in result['data']:attributes[attribute['key']] = getattr(attribute['value'], 'value', '0')results_file.write('%s|%s|%s\n' % (item, attributes['KEYWORD_TEXT'], attributes['SEARCH_VOLUME']))print ('%s|%s|%s' % (item, attributes['KEYWORD_TEXT'], attributes['SEARCH_VOLUME']))printelse:print ('No related keywords were found.')offset += PAGE_SIZEselector['paging']['startIndex'] = str(offset)more_pages = offset < int(page['totalNumEntries'])if __name__ == '__main__':# Initialize client object.adwords_client = adwords.AdWordsClient.LoadFromStorage("ABSOLUTE-PATH-TO-googleads.yaml")adwords_client.SetClientCustomerId('ENTER-YOURS-HERE')kwds = open("kwds.txt","r")#reload(sys)#sys.setdefaultencoding('utf-8')for line in kwds:item = line.strip()results_file = open("results.txt", "a+")main(adwords_client, item, int(AD_GROUP_ID) if AD_GROUP_ID.isdigit() else None)print(datetime.datetime.now())results_file.close()sleep(2)▪▪▪▪
...# Construct selector object and retrieve related keywords.selector = {'ideaType': 'KEYWORD','requestType': 'IDEAS'}selector['requestedAttributeTypes'] = ['KEYWORD_TEXT', 'SEARCH_VOLUME']offset = 0selector['paging'] = {'startIndex': str(offset),'numberResults': 10}...
"""Scrape the result links (//h3[@class="r"]/a/@href) from every SERP URL
listed in urls.txt and write one "url,hrefs" line per page to results.txt."""
from lxml import html
import requests

# Context managers guarantee both files are closed even if a request fails
# (the original opened them manually and never closed urls.txt).
with open("urls.txt", "r") as urls, open("results.txt", "w") as results_file:
    for item in urls:
        url = item.rstrip("\n")
        page = requests.get(url)
        tree = html.fromstring(page.content)
        # href of each organic result title link.
        text = tree.xpath('//h3[@class="r"]/a/@href')
        results_file.write("%s,%s\n" % (url, text))
        print ("SCRAPING " + url)
        print (text, "\n")
▪▪▪▪https://www.google.[com]/search?q=site:[dominio]&start=[#pagina]&...
▪▪▪/url?q=http://www.simpleagency.it/&sa=U&ved=0ahUKEwizuOnv1YTiAhU9GLkGHQUZAe8QFggUMAA&usg=AOvVaw2SLUR7xqI7OaMms1_bXQ3h
...#download and store new html fileos.rename('/home/giancampo/diff-html/new_html.html','/home/giancampo/diff-html/old_html.html')url = 'YOUR-HOMEPAGE-URL'response = urllib2.urlopen(url)webContent = response.read()f = open('/home/giancampo/diff-html/new_html.html', 'w')f.write(webContent)f.close()#convert html to txt fileshtml1 = open('/home/giancampo/diff-html/old_html.html').read()html2 = open('/home/giancampo/diff-html/new_html.html').read()old_file = html2text.html2text(html1)new_file = html2text.html2text(html2)#write text into txt filesold_text = open('/home/giancampo/diff-html/old_text.txt', 'w')new_text = open('/home/giancampo/diff-html/new_text.txt', 'w')old_text.write(old_file)new_text.write(new_file)old_text.close()new_text.close()...▪▪
...#send an email if the script has found differencesif filecmp.cmp('/home/giancampo/diff-html/old_text.txt','/home/giancampo/diff-html/new_text.txt') == True:print 'no emails sent'else:gmail_user = 'YOUR-GMAIL-ADDRESS'gmail_password = 'YOUR-GMAIL-PASSWORD'sent_from = gmail_userto = ['[email protected]']subject = 'Changes in the homepage!'body = _diffemail_text = '''From: %s\nTo: %s\nSubject: %s\n\n%s''' % (sent_from,', '.join(to), subject, body)server = smtplib.SMTP_SSL('smtp.gmail.com', 465)server.ehlo()server.login(gmail_user, gmail_password)server.sendmail(sent_from, to, email_text)server.close()print 'Email sent!'#files closingdiff_file.close()▪▪▪
▪▪▪→→▪
▪▪...if __name__ == '__main__':# Initialize client object.adwords_client = adwords.AdWordsClient.LoadFromStorage("C:\\Users\\gianl\\AppData\\Local\\Programs\\Python\\Python37\\_i miei script\\adwords-api\\googleads.yaml")adwords_client.SetClientCustomerId('ENTER-YOURS-HERE')kwds = open("kwds.txt","r")reload(sys)sys.setdefaultencoding('utf-8')for line in kwds:item = line.strip()results_file = open("results.txt", "a+")main(adwords_client, item, int(AD_GROUP_ID) if AD_GROUP_ID.isdigit() else None)print(datetime.datetime.now())results_file.close()sleep(2)