From 80e9c2452b34555537a1ac7c66469a9dc01d71b2 Mon Sep 17 00:00:00 2001 From: Alejandro Celaya Date: Wed, 22 May 2024 18:09:40 +0200 Subject: [PATCH] Convert encoding of resolved titles based on page encoding --- CHANGELOG.md | 1 + composer.json | 1 + config/constants.php | 1 - .../Helper/ShortUrlTitleResolutionHelper.php | 29 +++++++++++++------ .../ShortUrlTitleResolutionHelperTest.php | 11 ++++--- 5 files changed, 29 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ab0a09a..1257e3d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this ### Fixed * [#2111](https://github.com/shlinkio/shlink/issues/2111) Fix typo in OAS docs examples where redirect rules with `query-param` condition type were defined as `query`. +* [#2129](https://github.com/shlinkio/shlink/issues/2129) Fix error when resolving title for sites not using UTF-8 charset (detected with Japanese charsets). ## [4.1.0] - 2024-04-14 diff --git a/composer.json b/composer.json index 82a4a93a..c94cbdfa 100644 --- a/composer.json +++ b/composer.json @@ -16,6 +16,7 @@ "ext-curl": "*", "ext-gd": "*", "ext-json": "*", + "ext-mbstring": "*", "ext-pdo": "*", "akrabat/ip-address-middleware": "^2.1", "cakephp/chronos": "^3.0.2", diff --git a/config/constants.php b/config/constants.php index 51ee0476..20c64f19 100644 --- a/config/constants.php +++ b/config/constants.php @@ -12,7 +12,6 @@ const MIN_SHORT_CODES_LENGTH = 4; const DEFAULT_REDIRECT_STATUS_CODE = RedirectStatus::STATUS_302; const DEFAULT_REDIRECT_CACHE_LIFETIME = 30; const LOCAL_LOCK_FACTORY = 'Shlinkio\Shlink\LocalLockFactory'; -const TITLE_TAG_VALUE = '/]*>(.*?)<\/title>/i'; // Matches the value inside a html title tag const LOOSE_URI_MATCHER = '/(.+)\:(.+)/i'; // Matches anything starting with a schema. const DEFAULT_QR_CODE_SIZE = 300; const DEFAULT_QR_CODE_MARGIN = 0; diff --git a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php index e91b1ff1..3f9b6225 100644 --- a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php +++ b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php @@ -12,20 +12,24 @@ use Shlinkio\Shlink\Core\Options\UrlShortenerOptions; use Throwable; use function html_entity_decode; +use function mb_convert_encoding; use function preg_match; use function str_contains; use function str_starts_with; use function strtolower; use function trim; -use const Shlinkio\Shlink\TITLE_TAG_VALUE; - readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionHelperInterface { public const MAX_REDIRECTS = 15; public const CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' . 'Chrome/121.0.0.0 Safari/537.36'; + // Matches the value inside a html title tag + private const TITLE_TAG_VALUE = '/]*>(.*?)<\/title>/i'; + // Matches the charset inside a Content-Type header + private const CHARSET_VALUE = '/charset=([^;]+)/i'; + public function __construct( private ClientInterface $httpClient, private UrlShortenerOptions $options, @@ -53,7 +57,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH return $data; } - $title = $this->tryToResolveTitle($response); + $title = $this->tryToResolveTitle($response, $contentType); return $title !== null ? $data->withResolvedTitle($title) : $data; } @@ -76,7 +80,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH } } - private function tryToResolveTitle(ResponseInterface $response): ?string + private function tryToResolveTitle(ResponseInterface $response, string $contentType): ?string { $collectedBody = ''; $body = $response->getBody(); @@ -84,12 +88,19 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH while (! str_contains($collectedBody, '') && ! $body->eof()) { $collectedBody .= $body->read(1024); } - preg_match(TITLE_TAG_VALUE, $collectedBody, $matches); - return isset($matches[1]) ? $this->normalizeTitle($matches[1]) : null; - } - private function normalizeTitle(string $title): string - { + // Try to match the title from the tag + preg_match(self::TITLE_TAG_VALUE, $collectedBody, $titleMatches); + if (! isset($titleMatches[1])) { + return null; + } + + // Get the page's charset from Content-Type header + preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches); + + $title = isset($charsetMatches[1]) + ? mb_convert_encoding($titleMatches[1], 'utf8', $charsetMatches[1]) + : $titleMatches[1]; return html_entity_decode(trim($title)); } } diff --git a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php index 06b47f8c..92fac8eb 100644 --- a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php +++ b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php @@ -12,6 +12,7 @@ use Laminas\Diactoros\Response; use Laminas\Diactoros\Response\JsonResponse; use Laminas\Diactoros\Stream; use PHPUnit\Framework\Attributes\Test; +use PHPUnit\Framework\Attributes\TestWith; use PHPUnit\Framework\MockObject\Builder\InvocationMocker; use PHPUnit\Framework\MockObject\MockObject; use PHPUnit\Framework\TestCase; @@ -89,10 +90,12 @@ class ShortUrlTitleResolutionHelperTest extends TestCase } #[Test] - public function titleIsUpdatedWhenItCanBeResolvedFromResponse(): void + #[TestWith(['TEXT/html; charset=utf-8'], name: 'charset')] + #[TestWith(['TEXT/html'], name: 'no charset')] + public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType): void { $data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]); - $this->expectRequestToBeCalled()->willReturn($this->respWithTitle()); + $this->expectRequestToBeCalled()->willReturn($this->respWithTitle($contentType)); $result = $this->helper(autoResolveTitles: true)->processTitle($data); @@ -122,10 +125,10 @@ class ShortUrlTitleResolutionHelperTest extends TestCase return new Response($body, 200, ['Content-Type' => 'text/html']); } - private function respWithTitle(): Response + private function respWithTitle(string $contentType): Response { $body = $this->createStreamWithContent('<title data-foo="bar"> Resolved "title" '); - return new Response($body, 200, ['Content-Type' => 'TEXT/html; charset=utf-8']); + return new Response($body, 200, ['Content-Type' => $contentType]); } private function createStreamWithContent(string $content): Stream