Php скрипт создания sitemap

Php класс для автоматического создания sitemap.xml

Sitemaps — XML-файл с информацией для поисковых систем о страницах веб-сайта, которые подлежат индексации. Sitemaps может помочь поисковикам определить местонахождение страниц сайта, время их последнего обновления, частоту обновления и важность относительно других страниц сайта для того, чтобы поисковая машина смогла более разумно индексировать сайт. Существует куча разных способов создания заветного файла sitemap.xml. Есть куча сервисов, которые могут создать sitemap.xml. Можно и самому написать генератор «на лету» или же складывать готовые файлы и скармливать их ботам. Описание формата: http://www.sitemaps.org.

Использование протокола Sitemaps не является гарантией того, что веб-страницы будут проиндексированы поисковыми системами; это всего лишь дополнительная подсказка для сканеров, которые смогут выполнить более тщательное сканирование сайта.

Для тех, кто хочет спарсить свой сайт в sitemap.xml (разово или по крону), подойдет этот класс:

<?php
// NOTE(review): the beginning of this listing (the opening PHP tag, the class
// header, the property declarations and the signature of set_ignore()) was lost
// when the page was extracted, and every brace had been turned into an angle
// bracket. The class name `Sitemap` and the property defaults below are
// reconstructed from how the methods use them -- confirm against the original.

/**
 * Crawls a site starting from one URL, collects every internal link it finds
 * and renders the result as a sitemap.xml document.
 *
 * Typical use: set_ignore() -> get_links() -> generate_sitemap() (or get_array()).
 */
class Sitemap
{
    /** @var string[] URLs collected so far; doubles as the "already seen" list. */
    private $sitemap_urls = array();

    /** @var string Host part of the start URL (no scheme, no path). */
    private $base = '';

    /** @var string Scheme prefix of the start URL, e.g. "http://". */
    private $protocol = 'http://';

    /** @var string Start URL including the scheme. */
    private $domain = '';

    /** @var string[] Substrings that exclude a URL from the sitemap. */
    private $check = array();

    /** @var string Proxy as "host:port"; empty string means no proxy. */
    private $proxy = '';

    /**
     * Sets the list of substrings that exclude a URL from the sitemap.
     *
     * @param string[] $ignore_list e.g. array(".css", ".js", "javascript:")
     */
    public function set_ignore($ignore_list)
    {
        $this->check = $ignore_list;
    }

    /**
     * Sets a proxy host and port (such as someproxy:8080 or 10.1.1.1:8080).
     *
     * @param string $host_port
     */
    public function set_proxy($host_port)
    {
        $this->proxy = $host_port;
    }

    /**
     * Tells whether a URL contains none of the ignored substrings.
     *
     * @param string $url
     * @return bool true when the URL may be added to the sitemap
     */
    private function validate($url)
    {
        foreach ((array) $this->check as $val) {
            // An empty needle would match every URL (PHP 8) or raise a warning (PHP 7).
            if ($val !== '' && stripos($url, $val) !== false) {
                return false;
            }
        }

        return true;
    }

    /**
     * Creates an easy handle that returns the response body as a string.
     *
     * @param string $url
     * @param bool   $use_proxy apply the configured proxy, if any
     * @return resource|CurlHandle
     */
    private function new_handle($url, $use_proxy)
    {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        if ($use_proxy && !empty($this->proxy)) {
            curl_setopt($curl, CURLOPT_PROXY, $this->proxy);
        }

        return $curl;
    }

    /**
     * Runs all given easy handles in parallel and releases them afterwards.
     *
     * @param array $handles easy handles, any keys
     * @return array same keys; each value is array('ok' => bool, 'content' => string|null)
     */
    private function run_multi(array $handles)
    {
        $results = array();
        if (empty($handles)) {
            return $results;
        }

        $multi = curl_multi_init();
        foreach ($handles as $key => $curl) {
            curl_multi_add_handle($multi, $curl);
            $results[$key] = array('ok' => false, 'content' => null);
        }

        do {
            $status = curl_multi_exec($multi, $active);
            if ($status == CURLM_OK && $active) {
                // Wait for socket activity instead of spinning the CPU.
                if (curl_multi_select($multi, 1.0) === -1) {
                    usleep(1000);
                }
            }
        } while ($status == CURLM_CALL_MULTI_PERFORM || ($status == CURLM_OK && $active));

        // Per-transfer results of a multi handle are reported through
        // curl_multi_info_read(), not through curl_errno() on the easy handle.
        while (($info = curl_multi_info_read($multi)) !== false) {
            $key = array_search($info['handle'], $handles, true);
            if ($key !== false) {
                $results[$key]['ok'] = ($info['result'] === CURLE_OK);
            }
        }

        foreach ($handles as $key => $curl) {
            if ($results[$key]['ok']) {
                $results[$key]['content'] = curl_multi_getcontent($curl);
            }
            curl_multi_remove_handle($multi, $curl);
            curl_close($curl);
        }
        curl_multi_close($multi);

        return $results;
    }

    /**
     * Downloads several pages in parallel and parses each one for links.
     *
     * All responses are collected and every handle is released before parsing
     * starts, because parse_content() calls back into this method.
     *
     * @param string[] $urls
     * @return bool always true
     */
    private function multi_curl($urls)
    {
        $handles = array();
        foreach ($urls as $url) {
            $handles[] = $this->new_handle($url, true);
        }

        foreach ($this->run_multi($handles) as $result) {
            if ($result['ok'] && is_string($result['content'])) {
                $this->parse_content($result['content']);
            }
        }

        return true;
    }

    /**
     * Entry point: crawls the site starting at $domain.
     *
     * @param string $domain start URL; "http://" is assumed when no scheme is given
     * @return bool false when $domain is empty or the start page could not be fetched
     */
    public function get_links($domain)
    {
        $domain = trim($domain);
        if ($domain === '') {
            return false;
        }

        // Host part of the address: drop the scheme, keep everything up to the first "/".
        $host = explode('/', preg_replace('#^https?://#i', '', $domain));
        $this->base = $host[0];

        // Testing for a real scheme (not just a leading "http") keeps hosts such
        // as "httpbin.org" from being mistaken for an absolute URL.
        if (preg_match('#^(https?)://#i', $domain, $scheme)) {
            $this->protocol = strtolower($scheme[1]) . '://';
            $this->domain = $domain;
        } else {
            $this->protocol = 'http://';
            $this->domain = $this->protocol . $domain;
        }

        if (!in_array($this->domain, $this->sitemap_urls, true)) {
            $this->sitemap_urls[] = $this->domain;
        }

        $curl = $this->new_handle($this->domain, true);
        $page = curl_exec($curl);
        curl_close($curl);

        if (!is_string($page)) {
            return false;
        }
        $this->parse_content($page);

        return true;
    }

    /**
     * Turns an href value into an absolute http(s) URL.
     *
     * Relative paths are resolved against the site root, not against the page
     * they were found on.
     *
     * @param string $url raw attribute value
     * @return string|null absolute URL, or null for empty values, pure fragments
     *                     and non-http schemes (mailto:, tel:, javascript:, ...)
     */
    private function normalize_url($url)
    {
        // Attribute values are HTML-encoded ("&amp;"); decode before using them as URLs.
        $url = trim(html_entity_decode($url, ENT_QUOTES, 'UTF-8'));
        if ($url === '' || $url[0] === '#') {
            return null;
        }
        if (preg_match('#^https?://#i', $url)) {
            return $url;
        }
        if (strpos($url, '//') === 0) {
            // Protocol-relative link: reuse the scheme of the start URL.
            return rtrim($this->protocol, '/') . $url;
        }
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url)) {
            return null;
        }

        // Drop leading "./" and "../" segments, then any leading slash.
        $url = preg_replace('#^(?:\.{1,2}/)+#', '', $url);

        return $this->protocol . $this->base . '/' . ltrim($url, '/');
    }

    /**
     * Tells whether an absolute URL points at the crawled host.
     *
     * The host must be followed by "/", "?", "#", ":" or the end of the string,
     * so "example.com.evil.org" does not pass for "example.com".
     *
     * @param string $url
     * @return bool
     */
    private function is_internal($url)
    {
        $pattern = '#^https?://' . preg_quote($this->base, '#') . '(?:[/?\#:]|$)#i';

        return preg_match($pattern, $url) === 1;
    }

    /**
     * Extracts links from a page, stores the new internal ones and crawls them.
     *
     * @param string $page HTML source
     * @return bool always true
     */
    private function parse_content($page)
    {
        if (!is_string($page) || $page === '') {
            return true;
        }

        // NOTE(review): the "<a ...>" prefix of this pattern had been stripped
        // from the listing; restored here -- confirm against the original.
        // Group 1 holds single-quoted href values, group 2 double-quoted ones.
        $pattern = '/<a\b[^>]*href\s*=\s*\'([^\']*)\'|<a\b[^>]*href\s*=\s*"([^"]*)"/is';
        preg_match_all($pattern, $page, $match);

        $new_links = array();
        $group_count = count($match);
        for ($i = 1; $i < $group_count; $i++) {
            foreach ($match[$i] as $raw) {
                $url = $this->normalize_url($raw);
                if ($url === null || in_array($url, $this->sitemap_urls, true)) {
                    continue;
                }
                if (!$this->validate($url) || !$this->is_internal($url)) {
                    continue;
                }
                $this->sitemap_urls[] = $url;
                $new_links[] = $url;
            }
        }

        $this->multi_curl($new_links);

        return true;
    }

    /**
     * Returns the collected URLs.
     *
     * @return string[]
     */
    public function get_array()
    {
        return $this->sitemap_urls;
    }

    /**
     * Notifies Google, Bing, Yahoo, Ask and Weblogs about a sitemap update.
     *
     * NOTE(review): some of these ping endpoints may have been retired by
     * their operators -- verify before relying on the return value.
     *
     * @param string $sitemap_url address of the sitemap; "http://" is assumed when missing
     * @param string $title       site title; defaults to the host name
     * @param string $siteurl     site address; defaults to scheme + host
     * @return bool true only when every request completed without a transport error
     */
    public function ping($sitemap_url, $title = "", $siteurl = "")
    {
        $sitemap_url = trim($sitemap_url);
        if (!preg_match('#^https?://#i', $sitemap_url)) {
            $sitemap_url = "http://" . $sitemap_url;
        }

        list($start, $rest) = explode("//", $sitemap_url, 2);
        $path_parts = explode("/", $rest);
        $middle = $path_parts[0];

        if (trim($title) == "") {
            $title = $middle;
        }
        if (trim($siteurl) == "") {
            $siteurl = $start . "//" . $middle;
        }

        $urls = array();
        $urls[0] = "http://www.google.com/webmasters/tools/ping?sitemap=" . urlencode($sitemap_url);
        $urls[1] = "http://www.bing.com/webmaster/ping.aspx?siteMap=" . urlencode($sitemap_url);
        $urls[2] = "http://search.yahooapis.com/SiteExplorerService/V1/updateNotification"
                 . "?appid=YahooDemo&url=" . urlencode($sitemap_url);
        $urls[3] = "http://submissions.ask.com/ping?sitemap=" . urlencode($sitemap_url);
        $urls[4] = "http://rpc.weblogs.com/pingSiteForm?name=" . urlencode($title)
                 . "&url=" . urlencode($siteurl) . "&changesURL=" . urlencode($sitemap_url);

        $handles = array();
        foreach ($urls as $url) {
            $curl = $this->new_handle($url, false);
            // The option is CURLOPT_HTTP_VERSION; CURL_HTTP_VERSION_1_1 is its value.
            curl_setopt($curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
            $handles[] = $curl;
        }

        $submitted = true;
        foreach ($this->run_multi($handles) as $result) {
            if (!$result['ok']) {
                $submitted = false;
            }
        }

        return $submitted;
    }

    /**
     * Renders the collected URLs as a sitemap.xml document.
     *
     * @return string XML text
     */
    public function generate_sitemap()
    {
        // NOTE(review): the root element had been stripped from the listing;
        // restored according to the sitemaps.org 0.9 schema.
        $sitemap = new SimpleXMLElement(
            '<?xml version="1.0" encoding="UTF-8"?>'
            . '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>'
        );

        foreach ($this->sitemap_urls as $url) {
            $url_tag = $sitemap->addChild("url");
            // addChild() does not escape "&", so the value is escaped here.
            $url_tag->addChild("loc", htmlspecialchars($url));
        }

        return $sitemap->asXML();
    }
}
?>

Пример работы с PHP-классом для генерации карты сайта:

<?php
// NOTE(review): the first statements of this example (opening tag, inclusion of
// the class file, object creation) and the "<pre>" strings were lost when the
// page was extracted. The file name and the class name below are assumptions
// -- confirm against the original.
require_once 'sitemap.class.php';

$sitemap = new Sitemap();

// Substrings that exclude a URL from the sitemap.
$sitemap->set_ignore(array("javascript:", ".css", ".js", ".ico", ".jpg", ".png", ".jpeg", ".swf", ".gif"));

// Address of your site:
$sitemap->get_links("http://diamond-center.com.ua");

// To get the plain array of URLs instead of XML:
// $arr = $sitemap->get_array();
// echo "<pre>";
// print_r($arr);
// echo "</pre>";

// Send the sitemap to the client as XML.
header ("content-type: text/xml");
$map = $sitemap->generate_sitemap();
echo $map;
?>

Источник

Формирование файла sitemap.xml

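Предполагается, что в таблице статей есть даты публикации и последнего изменения
( `date_add` и `date_edit` ), сохранённые в виде Unix-меток времени: в коде ниже они передаются напрямую в `date()` и сравниваются с `time()`.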

// Builds sitemap.xml for the `articles` table by string concatenation and
// sends it to the client.
// NOTE(review): the XML fragments inside the string literals had been stripped
// from the listing; they are restored to match the sitemaps.org 0.9 schema and
// the sample output shown further down in the article.

// Connect to the database; make PDO throw on SQL errors instead of failing silently.
$dbh = new PDO('mysql:dbname=db_name;host=localhost', 'ЛОГИН', 'ПАРОЛЬ');
$dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

$out = '<?xml version="1.0" encoding="UTF-8"?>' . "\n"
     . '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";

// Fetch the articles.
$sth = $dbh->prepare("SELECT * FROM `articles`");
$sth->execute();
$articles = $sth->fetchAll(PDO::FETCH_ASSOC);

$now = time();
foreach ($articles as $row) {
    // Last modification time of the article. The columns are used as Unix
    // timestamps; the casts keep date() working when the driver returns
    // strings or NULL.
    $date = max((int) $row['date_add'], (int) $row['date_edit']);

    // Priority is 1 when the article was published or edited less than a week
    // (604800 seconds) ago, otherwise the default 0.5.
    $out .= '<url>'
          . '<loc>https://example.com/articles/' . rawurlencode((string) $row['id']) . '.html</loc>'
          . '<lastmod>' . date('Y-m-d', $date) . '</lastmod>'
          . '<priority>' . ((($date + 604800) > $now) ? '1' : '0.5') . '</priority>'
          . '</url>' . "\n";
}

$out .= '</urlset>';

header('Content-Type: text/xml; charset=utf-8');
echo $out;
exit();

Вариант на DOMDocument

// Builds sitemap.xml for the `articles` table with DOMDocument, saves it next
// to this script and also sends it to the client.

// Connect to the database; make PDO throw on SQL errors instead of failing silently.
$dbh = new PDO('mysql:dbname=db_name;host=localhost', 'логин', 'пароль');
$dbh->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

$dom = new DOMDocument('1.0', 'utf-8');
$urlset = $dom->createElement('urlset');
$urlset->setAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');

// Fetch the articles.
$sth = $dbh->prepare("SELECT * FROM `articles`");
$sth->execute();
$articles = $sth->fetchAll(PDO::FETCH_ASSOC);

$now = time();
foreach ($articles as $row) {
    // Last modification time of the article. The columns are used as Unix
    // timestamps; the casts keep date() working when the driver returns
    // strings or NULL.
    $date = max((int) $row['date_add'], (int) $row['date_edit']);

    $url = $dom->createElement('url');

    // Element: URL of the article. createTextNode() escapes special characters
    // on output, so the text is passed raw; running it through htmlentities()
    // first would escape it twice.
    $loc = $dom->createElement('loc');
    $loc->appendChild(
        $dom->createTextNode('https://example.com/articles/' . $row['id'] . '.html')
    );
    $url->appendChild($loc);

    // Element: date of the last modification.
    $lastmod = $dom->createElement('lastmod');
    $lastmod->appendChild($dom->createTextNode(date('Y-m-d', $date)));
    $url->appendChild($lastmod);

    // Element: priority (0 to 1.0, default 0.5). Articles published or edited
    // less than a week (604800 seconds) ago get priority 1.
    $priority = $dom->createElement('priority');
    $priority->appendChild(
        $dom->createTextNode((($date + 604800) > $now) ? '1' : '0.5')
    );
    $url->appendChild($priority);

    $urlset->appendChild($url);
}

$dom->appendChild($urlset);

// Save to a file; save() returns false on failure.
if ($dom->save(__DIR__ . '/sitemap.xml') === false) {
    error_log('sitemap: could not write ' . __DIR__ . '/sitemap.xml');
}

// Or send to the browser.
header('Content-Type: text/xml');
echo $dom->saveXML();
exit();

Результат:

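<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/articles/16.html</loc>
    <lastmod>2016-11-19</lastmod>
    <priority>1</priority>
  </url>
  <url>
    <loc>https://example.com/articles/3.html</loc>
    <lastmod>2016-11-08</lastmod>
    <priority>0.5</priority>
  </url>
</urlset>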

Перенаправление с .php на .xml

Например, данный скрипт лежит в корне сайта с именем sitemap.php, а вы хотите, чтобы он открывался по адресу https://example.com/sitemap.xml. В файле .htaccess нужно добавить запись:

# Serve sitemap.php under /sitemap.xml. The dot is escaped so that it matches a
# literal "." and not any character.
RewriteEngine On
RewriteRule ^sitemap\.xml$ sitemap.php [L,QSA]

Если скрипт сделан модулем CMS:

# Route /sitemap.xml to the CMS sitemap module.
# NOTE(review): the variable after "%" was lost in extraction; %{REQUEST_URI}
# is what the "^/sitemap.xml" pattern implies -- confirm against the original.
RewriteEngine On
RewriteCond %{REQUEST_URI} ^/sitemap\.xml$
RewriteRule (.*) /index.php?module=sitemap [L,QSA]

Sitemap в robots.txt

Чтобы поисковые системы знали, где расположен файл карты сайта, нужно добавить директиву Sitemap в robots.txt:

# Each robots.txt directive must be on its own line.
User-agent: *
Sitemap: https://example.com/sitemap.xml

Источник

Saved searches

Use saved searches to filter your results more quickly

You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session.

PHP script for generating a XML sitemap for your website

tristangoossens/php-sitemap-generator

This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?

Sign In Required

Please sign in to use Codespaces.

Launching GitHub Desktop

If nothing happens, download GitHub Desktop and try again.

Launching GitHub Desktop

If nothing happens, download GitHub Desktop and try again.

Launching Xcode

If nothing happens, download Xcode and try again.

Launching Visual Studio Code

Your codespace will open once ready.

There was a problem preparing your codespace, please try again.

Latest commit

Git stats

Files

Failed to load latest commit information.

README.md

Object-based PHP script that generates an XML sitemap from the given config options. I made this script because I wanted to automate making a sitemap for Google indexing and because there were not a lot of open source sitemap generators out there.

Feel free to help me implement any of the missing features or add extra features

  • Generate a sitemap for your website
  • Multiple options for generating sitemaps
  • Option to only look through certain filetypes
  • Load client side Javascript content when crawling
  • Parse all relative link types (// , # , ?) and more

To install this script, simply download both sitemap_config and sitemap_generator and place them in your project (in the same directory).

After installing the script you can use the script by including it into your script

include "/path/to/sitemap-generator.php";

And initializing the class by calling the constructor

// Create an object of the generator class passing the config file
$smg = new SitemapGenerator(include("sitemap-config.php"));

// Run the generator
$smg->GenerateSitemap();

You can alter some of the configs settings by changing the config values.

// Site to crawl and create a sitemap for.
// https://www.your-domain-name.com/ or http://www.your-domain-name.com/
"SITE_URL" => "https://student-laptop.nl/",

// Boolean for crawling external links.
// *Domain = https://www.student-laptop.nl* , *Link = https://www.google.com*
"ALLOW_EXTERNAL_LINKS" => false,

// Boolean for crawling element id links.
// NOTE(review): the example anchor that preceded "will not be crawled" was
// lost in extraction; presumably a link pointing at an element id.
// Such a link will not be crawled when this option is set to false
"ALLOW_ELEMENT_LINKS" => false,

// If set the crawler will only index the anchor tags with the given id.
// If you wish to crawl all links set the value to ""
// When CRAWL_ANCHORS_WITH_ID is set to "internal-link", an anchor carrying
// that id will be crawled but an anchor without it will not be crawled.
// NOTE(review): the two HTML examples of this comment were lost in extraction.
"CRAWL_ANCHORS_WITH_ID" => "",

// Array with absolute links or keywords for the pages to skip when crawling the given SITE_URL.
// https://student-laptop.nl/info/laptops or you can just input student-laptop.nl/info/
// and it will not crawl anything in that directory
// Try to be as specific as you can so you dont skip 300 pages
"KEYWORDS_TO_SKIP" => array(
    "http://localhost/student-laptop/index", // I already have a href for root ("/") on my page so skip this page
    "/student-laptop/student-laptop.nl/", // Invalid link example
),

// Location + filename where the sitemap will be saved.
"SAVE_LOC" => "sitemap.xml",

// Static priority value for sitemap
"PRIORITY" => 1,

// Static update frequency
"CHANGE_FREQUENCY" => "daily",

// Date changed (today's date)
"LAST_UPDATED" => date('Y-m-d'),

Example output when generating a sitemap using this script

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <!-- 3 total links-->
  <!-- PHP-sitemap-generator by https://github.com/tristangoossens -->
  <url>
    <loc>https://student-laptop.nl/</loc>
    <lastmod>2021-03-10</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>https://student-laptop.nl/underConstruction</loc>
    <lastmod>2021-03-10</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
  <url>
    <loc>https://student-laptop.nl/article?article_id=1</loc>
    <lastmod>2021-03-10</lastmod>
    <changefreq>daily</changefreq>
    <priority>1</priority>
  </url>
</urlset>

About

PHP script for generating a XML sitemap for your website

Источник

Читайте также:  Ajax search filter in php
Оцените статью