bridges/DeveloppezDotComBridge.php

<?php

/**
 * Bridge for Developpez.com news feeds.
 *
 * Reads the RSS feed of the selected Developpez.com sub-domain and, for each
 * entry, downloads the linked page to replace the short RSS abstract with the
 * full article body. Video iframes found in the article are attached to the
 * item as enclosures.
 */
class DeveloppezDotComBridge extends FeedExpander
{

	const MAINTAINER = 'Binnette';
	const NAME = 'Developpez.com Actus (FR)';
	const URI = 'https://www.developpez.com/';
	// Suffix appended to the selected sub-domain to build the feed host
	const DOMAIN = '.developpez.com/';
	// Path of the RSS feed, relative to the host root
	const RSS_URL = 'index/rss';
	const CACHE_TIMEOUT = 1800; // 30min
	const DESCRIPTION = 'Returns complete posts from developpez.com';
	// Encodings used by Developpez.com in their article bodies. Order matters:
	// when a paragraph is valid in several of them, the LAST one listed is
	// preferred (see decodeText()).
	const ENCONDINGS = array('Windows-1252', 'UTF-8');
	const PARAMETERS = array(
		array(
			'limit' => array(
				'name' => 'Max items',
				'type' => 'number',
				'defaultValue' => 5,
			),
			// List of the sub-domains for which an RSS feed can be selected
			'domain' => array(
				'type' => 'list',
				'name' => 'Domaine',
				'title' => 'Chosissez un sous-domaine',
				'values' => array(
					'= Domaine principal =' => 'www',
					'4d' => '4d',
					'abbyy' => 'abbyy',
					'access' => 'access',
					'agile' => 'agile',
					'ajax' => 'ajax',
					'algo' => 'algo',
					'alm' => 'alm',
					'android' => 'android',
					'apache' => 'apache',
					'applications' => 'applications',
					'arduino' => 'arduino',
					'asm' => 'asm',
					'asp' => 'asp',
					'aspose' => 'aspose',
					'bacasable' => 'bacasable',
					'big-data' => 'big-data',
					'bpm' => 'bpm',
					'bsd' => 'bsd',
					'business-intelligence' => 'business-intelligence',
					'c' => 'c',
					'cloud-computing' => 'cloud-computing',
					'club' => 'club',
					'cms' => 'cms',
					'cpp' => 'cpp',
					'crm' => 'crm',
					'css' => 'css',
					'd' => 'd',
					'dart' => 'dart',
					'data-science' => 'data-science',
					'db2' => 'db2',
					'delphi' => 'delphi',
					'dotnet' => 'dotnet',
					'droit' => 'droit',
					'eclipse' => 'eclipse',
					'edi' => 'edi',
					'embarque' => 'embarque',
					'emploi' => 'emploi',
					'etudes' => 'etudes',
					'excel' => 'excel',
					'firebird' => 'firebird',
					'flash' => 'flash',
					'go' => 'go',
					'green-it' => 'green-it',
					'gtk' => 'gtk',
					'hardware' => 'hardware',
					'hpc' => 'hpc',
					'humour' => 'humour',
					'ibmcloud' => 'ibmcloud',
					'intelligence-artificielle' => 'intelligence-artificielle',
					'interbase' => 'interbase',
					'ios' => 'ios',
					'java' => 'java',
					'javascript' => 'javascript',
					'javaweb' => 'javaweb',
					'jetbrains' => 'jetbrains',
					'jeux' => 'jeux',
					'kotlin' => 'kotlin',
					'labview' => 'labview',
					'laravel' => 'laravel',
					'latex' => 'latex',
					'lazarus' => 'lazarus',
					'linux' => 'linux',
					'mac' => 'mac',
					'matlab' => 'matlab',
					'megaoffice' => 'megaoffice',
					'merise' => 'merise',
					'microsoft' => 'microsoft',
					'mobiles' => 'mobiles',
					'mongodb' => 'mongodb',
					'mysql' => 'mysql',
					'netbeans' => 'netbeans',
					'nodejs' => 'nodejs',
					'nosql' => 'nosql',
					'objective-c' => 'objective-c',
					'office' => 'office',
					'open-source' => 'open-source',
					'openoffice-libreoffice' => 'openoffice-libreoffice',
					'oracle' => 'oracle',
					'outlook' => 'outlook',
					'pascal' => 'pascal',
					'perl' => 'perl',
					'php' => 'php',
					'portail-emploi' => 'portail-emploi',
					'portail-projets' => 'portail-projets',
					'postgresql' => 'postgresql',
					'powerpoint' => 'powerpoint',
					'preprod-emploi' => 'preprod-emploi',
					'programmation' => 'programmation',
					'project' => 'project',
					'purebasic' => 'purebasic',
					'pyqt' => 'pyqt',
					'python' => 'python',
					'qt-creator' => 'qt-creator',
					'qt' => 'qt',
					'r' => 'r',
					'raspberry-pi' => 'raspberry-pi',
					'reseau' => 'reseau',
					'ruby' => 'ruby',
					'rust' => 'rust',
					'sap' => 'sap',
					'sas' => 'sas',
					'scilab' => 'scilab',
					'securite' => 'securite',
					'sgbd' => 'sgbd',
					'sharepoint' => 'sharepoint',
					'solutions-entreprise' => 'solutions-entreprise',
					'spring' => 'spring',
					'sqlserver' => 'sqlserver',
					'stages' => 'stages',
					'supervision' => 'supervision',
					'swift' => 'swift',
					'sybase' => 'sybase',
					'symfony' => 'symfony',
					'systeme' => 'systeme',
					'talend' => 'talend',
					'typescript' => 'typescript',
					'uml' => 'uml',
					'unix' => 'unix',
					'vb' => 'vb',
					'vba' => 'vba',
					'virtualisation' => 'virtualisation',
					'visualstudio' => 'visualstudio',
					'web-semantique' => 'web-semantique',
					'web' => 'web',
					'webmarketing' => 'webmarketing',
					'wind' => 'wind',
					'windows-azure' => 'windows-azure',
					'windows' => 'windows',
					'windowsphone' => 'windowsphone',
					'word' => 'word',
					'xhtml' => 'xhtml',
					'xml' => 'xml',
					'zend-framework' => 'zend-framework'
				),
			)
		)
	);

	/**
	 * Build the RSS URL for the selected sub-domain.
	 *
	 * @return string The feed URL of the chosen sub-domain, or the feed URL of
	 *                the main site when no sub-domain was provided.
	 */
	private function getRssUrl()
	{
		$domain = $this->getInput('domain');
		if (empty($domain)) {
			return self::URI . self::RSS_URL;
		}

		return 'https://' . $domain . self::DOMAIN . self::RSS_URL;
	}

	/**
	 * Return the maximum number of items to produce.
	 *
	 * Falls back to the default declared in PARAMETERS when the 'limit' input
	 * is absent; otherwise a missing value would compare as "limit reached"
	 * and every item would be dropped.
	 *
	 * @return int
	 */
	private function getItemLimit()
	{
		$limit = $this->getInput('limit');
		if ($limit === null || $limit === '') {
			return self::PARAMETERS[0]['limit']['defaultValue'];
		}

		return (int) $limit;
	}

	/**
	 * Grab the RSS items from Developpez.com.
	 */
	public function collectData()
	{
		$url = $this->getRssUrl();
		// NOTE(review): the second argument is presumably the maximum number
		// of feed entries handled by FeedExpander - confirm against the parent.
		$this->collectExpandableDatas($url, 20);
	}

	/**
	 * Parse one RSS entry and try to replace its abstract with the full
	 * article found behind the entry URL.
	 *
	 * @param mixed $newsItem Raw feed entry, forwarded untouched to the parent
	 *                        parser.
	 * @return array|null The enriched item, or null once the requested number
	 *                    of items has been collected.
	 */
	protected function parseItem($newsItem)
	{
		if (count($this->items) >= $this->getItemLimit()) {
			return null;
		}

		// Let the default parser extract the standard fields first
		$item = parent::parseItem($newsItem);

		// There is a bug in the Developpez RSS: commas are written as '~?' in
		// the title, so they have to be restored manually
		$item['title'] = $this->fixComaInTitle($item['title'] ?? '');

		// Without a link there is no article to download: keep the RSS data
		if (empty($item['uri'])) {
			return $item;
		}

		// Fetch the full article behind the RSS item URL
		$articleHTMLContent = getSimpleHTMLDOMCached($item['uri']);
		if (!$articleHTMLContent) {
			// The page could not be turned into a DOM: keep the RSS abstract
			return $item;
		}

		// Only replace the default content when the custom parser produced
		// something, so that the bridge never returns an empty item
		$fullText = $this->extractFullText($articleHTMLContent);
		if ($fullText !== null && trim($fullText) !== '') {
			$item['content'] = $fullText;
		}

		// Attach the video URLs found in the article. The parent parser may
		// not have set 'enclosures' at all, hence the empty-array fallback.
		$videosUrl = $this->getAllVideoUrl($articleHTMLContent);
		if (!empty($videosUrl)) {
			$item['enclosures'] = array_merge($item['enclosures'] ?? array(), $videosUrl);
		}

		// Look for the blog writer/creator
		$author = $articleHTMLContent->find('[itemprop="creator"]', 0);
		if (!empty($author)) {
			$item['author'] = $author->outertext;
		}

		return $item;
	}

	/**
	 * Replace every '~?' by a proper comma ','.
	 *
	 * @param string $txt Title as found in the feed.
	 * @return string Title with the commas restored.
	 */
	private function fixComaInTitle($txt)
	{
		return str_replace('~?', ',', $txt);
	}

	/**
	 * Return the full article contained in the downloaded page.
	 *
	 * Developpez.com only provides a short abstract in its feed, so the page
	 * behind the item URL is used to rebuild the complete article.
	 *
	 * @param object $articleHTMLContent Parsed article page.
	 * @return string|null The decoded article body, or null when the page has
	 *                     no 'div.content' element.
	 */
	private function extractFullText($articleHTMLContent)
	{
		// Blog entries hold the complete article in a div with the class
		// 'content'. The feed can also list announcements (ebook, PDF, ...)
		// that have no such div; for those the default RSS content is kept.
		$divArticleEntry = $articleHTMLContent->find('div.content', 0);
		if (is_null($divArticleEntry)) {
			return null;
		}

		// Developpez.com mixes encodings inside a single article: some
		// paragraphs are UTF-8 while others are Windows-1252. The article can
		// therefore NOT be decoded in one pass; every child node (text,
		// pictures, links at the bottom of the article) is decoded on its own.
		$fullText = '';
		foreach ($divArticleEntry->nodes as $paragraph) {
			$outerText = $paragraph->outertext;

			// A standalone document is rebuilt from the node so that lookups
			// made on it can also match the node itself
			$html = str_get_html($outerText);
			if ($html) {
				$fullText .= $this->decodeParagraph($html);
			} else {
				// The fragment could not be re-parsed (isHtmlTagNotTxt() guards
				// against the same falsy result): decode the raw text directly
				$fullText .= $this->decodeText($outerText);
			}
		}

		// Finally return the properly encoded content of the article
		return $fullText;
	}

	/**
	 * Convert one paragraph of the article into UTF-8 HTML.
	 *
	 * A paragraph holding a video container is replaced by a link to the
	 * video; any other paragraph is decoded and returned with its markup.
	 *
	 * @param object $p Parsed document holding a single paragraph.
	 * @return string HTML fragment ready to be appended to the item content.
	 */
	private function decodeParagraph($p)
	{
		// First check whether this paragraph is a video
		$videoUrl = $this->getVideoUrl($p);
		if (!empty($videoUrl)) {
			// The URL comes from the remote page: escape it for the attribute
			// context, without double-encoding entities it already contains
			$href = htmlspecialchars($videoUrl, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
			// For a video, only a link to it is returned
			// &#128250; is the "television" emoji (U+1F4FA)
			return	'<p>
						<b>&#128250; <a href="' . $href . '">Voir la vidéo</a></b>
					</p>';
		}

		// outertext is used to keep the complete paragraph and not only the
		// text inside it; that way blocks such as <img> are preserved
		return $this->decodeText($p->outertext);
	}

	/**
	 * Detect the encoding of a fragment, convert it to UTF-8 and format it.
	 *
	 * The encodings are tried from the last declared one to the first, and
	 * the first successful conversion is used. When none succeeds, the
	 * fragment is formatted as is.
	 *
	 * @param string $txt Raw HTML fragment.
	 * @return string Formatted fragment. It is deliberately not trimmed, to
	 *                avoid gluing links to the surrounding text, like:
	 *                the software<a href="...">started</a>to...
	 */
	private function decodeText($txt)
	{
		foreach (array_reverse(self::ENCONDINGS) as $enc) {
			if (!mb_check_encoding($txt, $enc)) {
				continue;
			}
			$decodedTxt = iconv($enc, 'UTF-8', $txt);
			if ($decodedTxt !== false && $decodedTxt !== '') {
				return $this->formatParagraph($decodedTxt);
			}
		}

		// No conversion worked: use the non decoded version and hope it will
		// not be displayed too badly in the full text content
		return $this->formatParagraph($txt);
	}

	/**
	 * Tell whether $txt contains at least one HTML element.
	 *
	 * @param string $txt Fragment to inspect.
	 * @return bool True when the parsed fragment has at least one child
	 *              element under its root, false otherwise (including when
	 *              the fragment cannot be parsed).
	 */
	private function isHtmlTagNotTxt($txt)
	{
		$html = str_get_html($txt);
		return $html && $html->root && count($html->root->children) > 0;
	}

	/**
	 * Tell whether $txt starts with a letter, a digit or an underscore.
	 *
	 * The check is Unicode-aware so that accented letters, frequent at the
	 * start of French sentences, are recognised as word characters.
	 *
	 * @param string $txt Fragment to inspect.
	 * @return bool
	 */
	private function startsWithWordChar($txt)
	{
		$matched = preg_match('/^[\p{L}\p{N}_]/u', $txt);
		if ($matched === false) {
			// The 'u' modifier rejects subjects that are not valid UTF-8;
			// fall back to the byte-wise check for those
			$matched = preg_match('/^\w/', $txt);
		}

		return $matched === 1;
	}

	/**
	 * Add a leading space to a paragraph when needed.
	 *
	 * @param string $txt Decoded fragment.
	 * @return string The fragment prefixed with a space when it contains an
	 *                HTML element or starts with a word character (not
	 *                punctuation); the untouched fragment otherwise.
	 */
	private function formatParagraph($txt)
	{
		if ($this->isHtmlTagNotTxt($txt) || $this->startsWithWordChar($txt)) {
			return ' ' . $txt;
		}

		return $txt;
	}

	/**
	 * Retrieve the URL of every video found in the article.
	 *
	 * @param object $item Parsed article page.
	 * @return array List of video URLs; containers without a usable iframe
	 *               are skipped.
	 */
	private function getAllVideoUrl($item)
	{
		$urls = array();

		// Developpez wraps videos in a div with the class video-container
		foreach ($item->find('div.video-container') as $container) {
			$src = $this->getIframeSrc($container);
			if ($src !== null) {
				$urls[] = $src;
			}
		}

		return $urls;
	}

	/**
	 * Retrieve the video URL of a paragraph, if it holds a video container.
	 *
	 * @param object $p Parsed document holding a single paragraph.
	 * @return string|null The video URL, or null when there is none.
	 */
	private function getVideoUrl($p)
	{
		$divVideo = $p->find('div.video-container', 0);
		if (empty($divVideo)) {
			return null;
		}

		return $this->getIframeSrc($divVideo);
	}

	/**
	 * Extract the 'src' of the first iframe inside a video container.
	 *
	 * Works for Youtube. Other video platforms still have to be tested.
	 *
	 * @param object $container Element expected to wrap a video iframe.
	 * @return string|null Absolute URL of the video, or null when the
	 *                     container has no iframe or no usable 'src'.
	 */
	private function getIframeSrc($container)
	{
		$iframe = $container->find('iframe', 0);
		if (empty($iframe)) {
			return null;
		}

		// Only accept a real string value for the attribute
		$src = $iframe->getAttribute('src');
		if (!is_string($src)) {
			return null;
		}

		$src = trim($src);
		if (empty($src)) {
			return null;
		}

		// Protocol-relative URLs are made absolute
		if (str_starts_with($src, '//')) {
			$src = 'https:' . $src;
		}

		return $src;
	}
}