Try to convert sanitized request values and search keywords to UTF-8 using a
default charset list (the charsets configured for search.yahoo.co.jp), make
PageUrl::cleanupString() empty any string that is not 1-3 byte UTF-8, and add
search engine / social network definitions.

NOTE(review): indentation, blank lines and trailing whitespace of the context
lines were reconstructed from a whitespace-collapsed copy of this patch; if a
hunk is rejected, retry with `patch -l` (ignore whitespace) and verify the
removed "Yam" line, whose only difference is trailing whitespace.

diff -ur --strip-trailing-cr piwik.orig/core/Common.php piwik/core/Common.php
--- piwik.orig/core/Common.php 2016-05-23 11:15:11.000000000 +0900
+++ piwik/core/Common.php 2016-08-13 15:13:40.930493900 +0900
@@ -14,6 +14,7 @@
 use Piwik\Intl\Data\Provider\RegionDataProvider;
 use Piwik\Plugins\UserCountry\LocationProvider\DefaultProvider;
 use Piwik\Tracker\Cache as TrackerCache;
+use Piwik\Plugins\Referrers\SearchEngine;
 
 /**
  * Contains helper methods used by both Piwik Core and the Piwik Tracking engine.
@@ -322,6 +323,8 @@
         // $_GET and $_REQUEST already urldecode()'d
         // decode
         // note: before php 5.2.7, htmlspecialchars() double encodes &#x hex items
+        // try to convert the value to UTF-8 (using the default charset list) before decoding entities
+        $value = SearchEngine::getInstance()->convertCharsetByDefault($value);
         $value = html_entity_decode($value, self::HTML_ENCODING_QUOTE_STYLE, 'UTF-8');
 
         $value = self::sanitizeNullBytes($value);
diff -ur --strip-trailing-cr piwik.orig/core/Tracker/PageUrl.php piwik/core/Tracker/PageUrl.php
--- piwik.orig/core/Tracker/PageUrl.php 2016-04-01 10:07:54.000000000 +0900
+++ piwik/core/Tracker/PageUrl.php 2016-08-13 15:13:40.930493900 +0900
@@ -216,6 +216,33 @@
         return $originalUrl;
     }
 
+    /**
+     * Tells whether a string is made up exclusively of well-formed UTF-8 sequences
+     * of one to three bytes (U+0000 - U+FFFF). The empty string is considered valid.
+     *
+     * Four-byte sequences (U+10000 - U+10FFFF) are rejected on purpose: they are valid
+     * per RFC 3629, but presumably cannot be stored by MySQL's 3-byte utf8 charset.
+     *
+     * @param string $string The string being checked.
+     * @return boolean
+     */
+    private static function isValidUtf8($string)
+    {
+        // Reference: http://tools.ietf.org/html/rfc3629
+        $regex = '/(?:'
+            . '[\x00-\x7f]|'                        // U+0000 - U+007F
+            . '[\xc2-\xdf][\x80-\xbf]|'             // U+0080 - U+07FF
+            . '\xe0[\xa0-\xbf][\x80-\xbf]|'         // U+0800 - U+0FFF
+            . '[\xe1-\xec][\x80-\xbf][\x80-\xbf]|'  // U+1000 - U+CFFF
+            . '\xed[\x80-\x9f][\x80-\xbf]|'         // U+D000 - U+D7FF
+            . '[\xee-\xef][\x80-\xbf][\x80-\xbf]'   // U+E000 - U+FFFF
+            . ')/';
+
+        // Remove every accepted sequence; any byte left over is malformed or unsupported.
+        // preg_replace() returns null on a PCRE error, which also counts as invalid here.
+        return preg_replace($regex, '', $string) === '';
+    }
+
     /**
      * Clean up string contents (filter, truncate, ...)
      *
@@ -224,11 +251,16 @@
      */
     public static function cleanupString($string)
     {
+        // discard the whole value when it contains bytes isValidUtf8() does not accept
+        if (!self::isValidUtf8($string)) {
+            $string = '';
+        }
         $string = trim($string);
         $string = str_replace(array("\n", "\r", "\0"), '', $string);
 
         $limit = Config::getInstance()->Tracker['page_maximum_length'];
-        $clean = substr($string, 0, $limit);
+        // presumably multi-byte aware, so truncation cannot split a UTF-8 sequence - confirm in Common
+        $clean = Common::mb_substr($string, 0, $limit);
         return $clean;
     }
 
diff -ur --strip-trailing-cr piwik.orig/plugins/Referrers/SearchEngine.php piwik/plugins/Referrers/SearchEngine.php
--- piwik.orig/plugins/Referrers/SearchEngine.php 2016-07-14 10:48:53.000000000 +0900
+++ piwik/plugins/Referrers/SearchEngine.php 2016-08-13 15:13:40.930493900 +0900
@@ -24,6 +24,8 @@
 
     /** @var string location of definition file (relative to PIWIK_INCLUDE_PATH) */
     const DEFINITION_FILE = '/vendor/piwik/searchengine-and-social-list/SearchEngines.yml';
+    /** @var string host of the search engine whose charsets are used as the default charset list */
+    const DEFAULT_CHARSETS_BY_HOST = 'search.yahoo.co.jp';
 
     protected $definitionList = null;
 
@@ -388,6 +390,32 @@
         return $host;
     }
 
+    /**
+     * Returns the charsets configured for the search engine definition found
+     * for self::DEFAULT_CHARSETS_BY_HOST.
+     *
+     * @return array the 'charsets' entry of that definition, or an empty array if it is missing or empty
+     */
+    private function getDefaultCharsets()
+    {
+        $searchEngine = $this->getDefinitionByHost(self::DEFAULT_CHARSETS_BY_HOST);
+        if (!empty($searchEngine) && !empty($searchEngine['charsets'])) {
+            return $searchEngine['charsets'];
+        }
+        return array();
+    }
+
+    /**
+     * Passes the given string to convertCharset() together with the default charsets.
+     *
+     * @param string $string
+     * @return string whatever convertCharset() returns for the string
+     */
+    public function convertCharsetByDefault($string)
+    {
+        return $this->convertCharset($string, $this->getDefaultCharsets());
+    }
+
     /**
      * Tries to convert the given string from one of the given charsets to UTF-8
      * @param string $string
@@ -396,6 +424,10 @@
      */
     protected function convertCharset($string, $charsets)
     {
+        // no charsets given: fall back to the default charset list
+        if (empty($charsets)) {
+            $charsets = $this->getDefaultCharsets();
+        }
         if (function_exists('iconv')
             && !empty($charsets)
         ) {
diff -ur --strip-trailing-cr piwik.orig/vendor/piwik/searchengine-and-social-list/SearchEngines.yml piwik/vendor/piwik/searchengine-and-social-list/SearchEngines.yml
--- piwik.orig/vendor/piwik/searchengine-and-social-list/SearchEngines.yml 2016-07-14 02:20:30.000000000 +0900
+++ piwik/vendor/piwik/searchengine-and-social-list/SearchEngines.yml 2016-08-13 15:19:12.434500740 +0900
@@ -8,6 +8,15 @@
     backlink: 's/{k}'
     charsets:
       - iso-8859-2
+118 700:
+  -
+    urls:
+      - www.118700.se
+      - foretag.118700.se
+      - webben.118700.se
+    params:
+      - q
+    backlink: 'sok.aspx?q={k}'
 123people:
   -
     urls:
@@ -74,6 +83,13 @@
     params:
       - q
     backlink: 's.py?q={k}'
+Allaverksamheter:
+  -
+    urls:
+      - www.allaverksamheter.se
+    params:
+      - What
+    backlink: 'SearchResult.aspx?What={k}'
 Alexa:
   -
     urls:
@@ -265,6 +281,10 @@
     params:
       - q
     backlink: '?q={k}'
+    charsets:
+      - utf-8
+      - euc-jp
+      - ms932
 auone Images:
   -
     urls:
@@ -272,6 +292,10 @@
     params:
       - q
     backlink: '?q={k}'
+    charsets:
+      - utf-8
+      - euc-jp
+      - ms932
 Austronaut:
   -
     urls:
@@ -319,6 +343,10 @@
     params:
       - q
     backlink: 'cgi-bin/search-st?q={k}'
+    charsets:
+      - utf-8
+      - euc-jp
+      - ms932
 Biglobe Images:
   -
     urls:
@@ -1230,6 +1258,13 @@
     params:
       - q
     backlink: 'szukaj?q={k}'
+Isodelen:
+  -
+    urls:
+      - www.isodelen.se
+    params:
+      - Keywords
+    backlink: 'sokresultat?Keywords={k}'
 Ixquick:
   -
     urls:
@@ -1650,6 +1685,13 @@
       - query
       - words
     backlink: 'search?query={k}'
+Riksdelen:
+  -
+    urls:
+      - www.riksdelen.se
+    params:
+      - What
+    backlink: 'SearchResult.aspx?What={k}'
 Road Runner:
   -
     urls:
@@ -2245,6 +2287,10 @@
       - p
       - vp
     backlink: 'search?p={k}'
+    charsets:
+      - utf-8
+      - euc-jp
+      - ms932
   -
     urls:
       - jp.hao123.com
@@ -2267,6 +2313,10 @@
     params:
       - p
     backlink: 'search?p={k}'
+    charsets:
+      - utf-8
+      - euc-jp
+      - ms932
 Yahoo! Japan Videos:
   -
     urls:
@@ -2274,8 +2324,12 @@
     params:
       - p
     backlink: 'search?p={k}'
+    charsets:
+      - utf-8
+      - euc-jp
+      - ms932
 Yam:
-  - 
+  -
     urls:
       - search.yam.com
     params:
diff -ur --strip-trailing-cr piwik.orig/vendor/piwik/searchengine-and-social-list/Socials.yml piwik/vendor/piwik/searchengine-and-social-list/Socials.yml
--- piwik.orig/vendor/piwik/searchengine-and-social-list/Socials.yml 2016-07-14 02:20:30.000000000 +0900
+++ piwik/vendor/piwik/searchengine-and-social-list/Socials.yml 2016-08-13 15:19:15.001500794 +0900
@@ -216,3 +216,12 @@
 
 tumblr:
   - tumblr.com
+
+mixi:
+  - mixi.jp
+
+lang-8:
+  - lang-8.com
+
+gree:
+  - gree.jp