diff -urN piwik.orig/core/DataFiles/SearchEngines.php piwik/core/DataFiles/SearchEngines.php --- piwik.orig/core/DataFiles/SearchEngines.php 2015-08-14 01:41:13.000000000 +0900 +++ piwik/core/DataFiles/SearchEngines.php 2015-11-12 17:00:20.922005741 +0900 @@ -56,6 +56,9 @@ */ if (!isset($GLOBALS['Piwik_SearchEngines'])) { $GLOBALS['Piwik_SearchEngines'] = array( + // default character code(s) + 'default' => array('', '', '', array('UTF-8', 'EUC-JP', 'MS932')), + // 1 '1.cz' => array('1.cz', array('/s\/([^\/]+)/', 'q'), 's/{k}', 'iso-8859-2'), @@ -197,9 +200,9 @@ 'searchatlas.centrum.cz' => array('Atlas', 'q', '?q={k}'), // auone - 'search.auone.jp' => array('auone', 'q', '?q={k}'), - 'sp-search.auone.jp' => array('auone'), - 'sp-image.search.auone.jp' => array('auone Images', 'q', '?q={k}'), + 'search.auone.jp' => array('auone', 'q', '?q={k}', array('UTF-8', 'EUC-JP', 'MS932')), + 'sp-search.auone.jp' => array('auone', '', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'sp-image.search.auone.jp' => array('auone Images', 'q', '?q={k}', array('UTF-8', 'EUC-JP', 'MS932')), // Austronaut 'www2.austronaut.at' => array('Austronaut', 'q'), @@ -220,8 +223,8 @@ 'web.gougou.com' => array('Baidu', 'search', 'search?search={k}'), // uses baidu search // Biglobe - 'cgi.search.biglobe.ne.jp' => array('Biglobe', 'q', 'cgi-bin/search-st?q={k}'), - 'images.search.biglobe.ne.jp' => array('Biglobe Images', 'q', 'cgi-bin/search-st?q={k}'), + 'cgi.search.biglobe.ne.jp' => array('Biglobe', 'q', 'cgi-bin/search-st?q={k}', array('UTF-8', 'EUC-JP', 'MS932')), + 'images.search.biglobe.ne.jp' => array('Biglobe Images', 'q', 'cgi-bin/search-st?q={k}', array('UTF-8', 'EUC-JP', 'MS932')), // Bing 'bing.com' => array('Bing', array('q', 'Q'), 'search?q={k}'), @@ -341,7 +344,7 @@ 'search.excite.es' => array('Excite'), 'search.excite.nl' => array('Excite'), 'msxml.excite.com' => array('Excite', '/\/[^\/]+\/ws\/results\/[^\/]+\/([^\/]+)/'), - 'www.excite.co.jp' => array('Excite', 'search', 'search.gw?search={k}', 'SHIFT_JIS'), + 'www.excite.co.jp' => array('Excite', 'search', 'search.gw?search={k}', array('UTF-8', 'EUC-JP', 'MS932')), // Exalead 'www.exalead.fr' => array('Exalead', 'q', 'search/results?q={k}'), @@ -416,11 +419,11 @@ 'www.gomeo.com' => array('Gomeo', array('Keywords', '/\/search\/([^\/]+)/'), '/search/{k}'), // goo - 'search.goo.ne.jp' => array('goo', 'MT', 'web.jsp?MT={k}'), - 'ocnsearch.goo.ne.jp' => array('goo'), + 'search.goo.ne.jp' => array('goo', 'MT', 'web.jsp?MT={k}', array('UTF-8', 'EUC-JP', 'MS932')), + 'ocnsearch.goo.ne.jp' => array('goo', '', '', array('UTF-8', 'EUC-JP', 'MS932')), // Google - 'google.com' => array('Google', 'q', 'search?q={k}'), + 'google.com' => array('Google', 'q', 'search?q={k}', array('UTF-8', 'EUC-JP', 'MS932')), 'google.{}' => array('Google'), 'www2.google.com' => array('Google'), 'ipv6.google.com' => array('Google'), @@ -471,12 +474,12 @@ '{}.wow.com' => array('Google'), 'search.leonardo.it' => array('Google'), 'www.optuszoo.com.au' => array('Google'), - 'search.dolphin-browser.jp' => array('Google'), - 'netlavis.azione.jp' => array('Google'), - 'search.nan.so' => array('Google'), - 'cgi2.nintendo.co.jp' => array('Google', 'gsc.q'), - 'search.smt.docomo.ne.jp' => array('Google', 'MT'), - 'image.search.smt.docomo.ne.jp' => array('Google', 'MT'), + 'search.dolphin-browser.jp' => array('Google', '', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'netlavis.azione.jp' => array('Google', '', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'search.nan.so' => array('Google', '', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'cgi2.nintendo.co.jp' => array('Google', 'gsc.q', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'search.smt.docomo.ne.jp' => array('Google', 'MT', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'image.search.smt.docomo.ne.jp' => array('Google', 'MT', '', array('UTF-8', 'EUC-JP', 'MS932')), 'gfsoso.com' => array('Google', 'q'), 'searches.safehomepage.com' => array('Google', 'q'), 'searches.f-secure.com' => array('Google', 'query', 'search?query={k}'), @@ -492,9 +495,9 @@ 'blogsearch.google.{}' => array('Google Blogsearch'), // Google Custom Search - 'google.com/cse' => array('Google Custom Search', array('q', 'query')), + 'google.com/cse' => array('Google Custom Search', array('q', 'query'), '', array('UTF-8', 'EUC-JP', 'MS932')), 'google.{}/cse' => array('Google Custom Search'), - 'google.com/custom' => array('Google Custom Search'), + 'google.com/custom' => array('Google Custom Search', '', '', array('UTF-8', 'EUC-JP', 'MS932')), 'google.{}/custom' => array('Google Custom Search'), // Google Translation @@ -754,9 +757,9 @@ 'www.neti.ee' => array('Neti', 'query', 'cgi-bin/otsing?query={k}', 'iso-8859-1'), // Nifty - 'search.nifty.com' => array('Nifty', array('q', 'Text'), 'websearch/search?q={k}'), - 'search.azby.fmworld.net' => array('Nifty'), - 'videosearch.nifty.com' => array('Nifty Videos', 'kw', 'search?kw={k}'), + 'search.nifty.com' => array('Nifty', array('q', 'Text'), 'websearch/search?q={k}', array('UTF-8', 'EUC-JP', 'MS932')), + 'search.azby.fmworld.net' => array('Nifty', '', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'videosearch.nifty.com' => array('Nifty Videos', 'kw', 'search?kw={k}', array('UTF-8', 'EUC-JP', 'MS932')), // Nigma 'nigma.ru' => array('Nigma', 's', 'index.php?s={k}'), @@ -808,7 +811,7 @@ 'www.qwant.com' => array('Qwant', 'q'), // Rakuten - 'websearch.rakuten.co.jp' => array('Rakuten', 'qt', 'WebIS?qt={k}'), + 'websearch.rakuten.co.jp' => array('Rakuten', 'qt', 'WebIS?qt={k}', array('UTF-8', 'EUC-JP', 'MS932')), // Rambler 'nova.rambler.ru' => array('Rambler', array('query', 'words'), 'search?query={k}'), @@ -877,11 +880,11 @@ 'search.snap.do' => array('Snap.do', 'q', '?q={k}'), // SeeSaa - 'search.seesaa.jp' => array('SeeSaa', '/\/([^\/]+)\/index\.html/', '{k}/index.html'), + 'search.seesaa.jp' => array('SeeSaa', '/\/([^\/]+)\/index\.html/', '{k}/index.html', array('UTF-8', 'EUC-JP', 'MS932')), // So-net - 'www.so-net.ne.jp' => array('So-net', 'query', 'search/web/?query={k}'), - 'video.so-net.ne.jp' => array('So-net Videos', 'kw', 'search/?kw={k}'), + 'www.so-net.ne.jp' => array('So-net', 'query', 'search/web/?query={k}', array('UTF-8', 'EUC-JP', 'MS932')), + 'video.so-net.ne.jp' => array('So-net Videos', 'kw', 'search/?kw={k}', array('UTF-8', 'EUC-JP', 'MS932')), // Sogou 'www.sogou.com' => array('Sogou', 'query', 'web?query={k}', 'gb2312'), @@ -1029,7 +1032,7 @@ 'szukaj.wp.pl' => array('Wirtualna Polska', 'szukaj', 'http://szukaj.wp.pl/szukaj.html?szukaj={k}'), // Woopie - 'www.woopie.jp' => array('Woopie', 'kw', 'search?kw={k}'), + 'www.woopie.jp' => array('Woopie', 'kw', 'search?kw={k}', array('UTF-8', 'EUC-JP', 'MS932')), // WWW 'search.www.ee' => array('www värav', 'query'), @@ -1038,11 +1041,11 @@ 'www.x-recherche.com' => array('X-Recherche', 'MOTS', 'cgi-bin/websearch?MOTS={k}'), // Yahoo! Japan - 'search.yahoo.co.jp' => array('Yahoo! Japan', array('p', 'vp'), 'search?p={k}'), - 'jp.hao123.com' => array('Yahoo! Japan', 'query'), - 'home.kingsoft.jp' => array('Yahoo! Japan', 'keyword'), - 'video.search.yahoo.co.jp' => array('Yahoo! Japan Videos', 'p', 'search?p={k}'), - 'image.search.yahoo.co.jp' => array('Yahoo! Japan Images', 'p', 'search?p={k}'), + 'search.yahoo.co.jp' => array('Yahoo! Japan', array('p', 'vp'), 'search?p={k}', array('UTF-8', 'EUC-JP', 'MS932')), + 'jp.hao123.com' => array('Yahoo! Japan', 'query', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'home.kingsoft.jp' => array('Yahoo! Japan', 'keyword', '', array('UTF-8', 'EUC-JP', 'MS932')), + 'video.search.yahoo.co.jp' => array('Yahoo! Japan Videos', 'p', 'search?p={k}', array('UTF-8', 'EUC-JP', 'MS932')), + 'image.search.yahoo.co.jp' => array('Yahoo! Japan Images', 'p', 'search?p={k}', array('UTF-8', 'EUC-JP', 'MS932')), // Yahoo 'search.yahoo.com' => array('Yahoo!', array('p', 'q'), 'search?p={k}'), diff -urN piwik.orig/core/Tracker/PageUrl.php piwik/core/Tracker/PageUrl.php --- piwik.orig/core/Tracker/PageUrl.php 2015-06-08 12:35:41.000000000 +0900 +++ piwik/core/Tracker/PageUrl.php 2015-11-12 16:30:35.541005759 +0900 @@ -12,6 +12,7 @@ use Piwik\Common; use Piwik\Config; use Piwik\Piwik; +use Piwik\Log; use Piwik\UrlHelper; class PageUrl @@ -208,6 +209,33 @@ } /** + * Detect UTF-8, which mysql understands. + * + * @param string $string The string being detected. + * @return boolean + */ + private static function isValidUtf8($string) + { + // Reference: http://tools.ietf.org/html/rfc3629 + $regex = '/(?:' + . '[\x00-\x7f]|' // U+0000 - U+007F + . '[\xc2-\xdf][\x80-\xbf]|' // U+0080 - U+07FF + . '\xe0[\xa0-\xbf][\x80-\xbf]|' // U+0800 - U+0FFF + . '[\xe1-\xec][\x80-\xbf][\x80-\xbf]|' // U+1000 - U+CFFF + . '\xed[\x80-\x9f][\x80-\xbf]|' // U+D000 - U+D7FF + . '[\xee-\xef][\x80-\xbf][\x80-\xbf]|' // U+E000 - U+FFFF + # . '\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|' // U+10000 - U+3FFFF perhaps, mysql does not understand. + # . '[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]|' // U+40000 - U+FFFFF perhaps, mysql does not understand. + # . '\xf4[\x80-\x8f][\x80-\xbf][\x80-\xbf]|' // U+100000 - U+10FFFF perhaps, mysql does not understand. + . ')/'; + $result = preg_replace($regex, '', $string); + if ($result !== '') { + return FALSE; + } + return TRUE; + } + + /** * Clean up string contents (filter, truncate, ...) * * @param string $string Dirty string @@ -215,11 +243,18 @@ */ public static function cleanupString($string) { + if (!PageUrl::isValidUtf8($string)) { + $string = ''; + } $string = trim($string); $string = str_replace(array("\n", "\r", "\0"), '', $string); - + Log::debug("cleanupString = '$string'"); $limit = Config::getInstance()->Tracker['page_maximum_length']; - $clean = substr($string, 0, $limit); + if (function_exists('mb_substr')) { + $clean = mb_substr($string, 0, $limit, 'utf-8'); + } else { + $clean = substr($string, 0, $limit); + } return $clean; } diff -urN piwik.orig/core/UrlHelper.php piwik/core/UrlHelper.php --- piwik.orig/core/UrlHelper.php 2015-09-11 12:46:43.000000000 +0900 +++ piwik/core/UrlHelper.php 2015-11-12 16:30:35.578005754 +0900 @@ -458,34 +458,32 @@ } if (!empty($key)) { - if (function_exists('iconv') - && isset($searchEngines[$referrerHost][3]) - ) { + if (function_exists('iconv')) { + $charsets = !empty($searchEngines[$referrerHost][3]) ? $searchEngines[$referrerHost][3] : (!empty($searchEngines['default'][3]) ? $searchEngines['default'][3] : array('UTF-8', 'EUC-JP', 'MS932')); // accepts string, array, or comma-separated list string in preferred order - $charsets = $searchEngines[$referrerHost][3]; if (!is_array($charsets)) { $charsets = explode(',', $charsets); } - - if (!empty($charsets)) { - $charset = $charsets[0]; - if (count($charsets) > 1 - && function_exists('mb_detect_encoding') - ) { - $charset = mb_detect_encoding($key, $charsets); - if ($charset === false) { - $charset = $charsets[0]; - } + $charset = $charsets[0]; + if (count($charsets) > 1 + && function_exists('mb_detect_encoding') + ) { + $charset = mb_detect_encoding($key, $charsets); + if ($charset === false) { + $charset = $charsets[0]; } + } - $newkey = @iconv($charset, 'UTF-8//IGNORE', $key); - if (!empty($newkey)) { - $key = $newkey; - } + $newkey = @iconv($charset, 'UTF-8//IGNORE', $key); + if (!empty($newkey)) { + $key = $newkey; + } else { + $key = "mb_detect_encoding can't recognize character encoding."; } } $key = Common::mb_strtolower($key); + Log::debug("searchengine, detected charcode, key = '$searchEngineName', '$charset', '$key'"); } return array( diff -urN piwik.orig/plugins/Actions/Actions/ActionSiteSearch.php piwik/plugins/Actions/Actions/ActionSiteSearch.php --- piwik.orig/plugins/Actions/Actions/ActionSiteSearch.php 2014-12-16 11:59:50.000000000 +0900 +++ piwik/plugins/Actions/Actions/ActionSiteSearch.php 2015-11-12 16:30:35.581005753 +0900 @@ -180,19 +180,42 @@ if (is_array($actionName)) { $actionName = reset($actionName); } + Common::printDebug("actionname: " . $actionName); + $actionName = trim(urldecode($actionName)); + $searchEngines = Common::getSearchEngineUrls(); + if (!empty($actionName)) { + if (function_exists('iconv')) { + // accepts string, array, or comma-separated list string in preferred order + $charsets = !empty($searchEngines['default'][3]) ? $searchEngines['default'][3] : array('UTF-8', 'EUC-JP', 'MS932'); + if (!is_array($charsets)) { + $charsets = explode(',', $charsets); + } + $charset = $charsets[0]; + if (count($charsets) > 1 + && function_exists('mb_detect_encoding') + ) { + $charset = mb_detect_encoding($actionName, $charsets); + if ($charset === false) { + $charset = $charsets[0]; + } + } - $actionName = PageUrl::urldecodeValidUtf8($actionName); - $actionName = trim($actionName); + $newactionName = @iconv($charset, 'UTF-8//IGNORE', $actionName); + if (!empty($newactionName)) { + $actionName = $newactionName; + } else { + $actionName = ''; + } + Common::printDebug("detected charcode, actionname = '$charset', '$actionName'"); + } + } if (empty($actionName)) { return false; } - if (is_array($categoryName)) { $categoryName = reset($categoryName); } - $categoryName = PageUrl::urldecodeValidUtf8($categoryName); - $categoryName = trim($categoryName); - + $categoryName = trim(urldecode($categoryName)); return array($url, $actionName, $categoryName, $count); } diff -urN piwik.orig/plugins/Referrers/Columns/Keyword.php piwik/plugins/Referrers/Columns/Keyword.php --- piwik.orig/plugins/Referrers/Columns/Keyword.php 2014-12-16 11:59:50.000000000 +0900 +++ piwik/plugins/Referrers/Columns/Keyword.php 2015-11-12 16:30:35.584005761 +0900 @@ -44,7 +44,11 @@ $information = $this->getReferrerInformationFromRequest($request); if (!empty($information['referer_keyword'])) { - return substr($information['referer_keyword'], 0, 255); + if (function_exists('mb_substr')) { + return mb_substr($information['referer_keyword'], 0, 255, 'UTF-8'); + } else { + return substr($information['referer_keyword'], 0, 255); + } } return $information['referer_keyword']; diff -urN piwik.orig/plugins/Referrers/Columns/ReferrerName.php piwik/plugins/Referrers/Columns/ReferrerName.php --- piwik.orig/plugins/Referrers/Columns/ReferrerName.php 2014-12-16 11:59:50.000000000 +0900 +++ piwik/plugins/Referrers/Columns/ReferrerName.php 2015-11-12 16:30:35.586005758 +0900 @@ -38,10 +38,12 @@ $information = $this->getReferrerInformationFromRequest($request); if (!empty($information['referer_name'])) { - - return substr($information['referer_name'], 0, 70); + if (function_exists('mb_substr')) { + return mb_substr($information['referer_name'], 0, 70, 'UTF-8'); + } else { + return substr($information['referer_name'], 0, 70); + } } - return $information['referer_name']; }