From e9fe1ac5d4dc7af84807ef1245c00b9c436d9444 Mon Sep 17 00:00:00 2001 From: Alejandro Celaya Date: Tue, 28 Jan 2025 10:04:30 +0100 Subject: [PATCH] Fix error when creating short URL for page with unsupported encoding --- CHANGELOG.md | 1 + module/Core/config/dependencies.config.php | 1 + .../Helper/ShortUrlTitleResolutionHelper.php | 61 +++++++++++++++--- .../ShortUrlTitleResolutionHelperTest.php | 62 +++++++++++++++++-- 4 files changed, 112 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d09a1851..ed878e3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this ### Fixed * [#2341](https://github.com/shlinkio/shlink/issues/2341) Ensure all asynchronous jobs that interact with the database do not leave idle connections open. +* [#2334](https://github.com/shlinkio/shlink/issues/2334) Improve how page titles are encoded to UTF-8, falling back from mbstring to iconv if available, and ultimately using the original title in case of error, but never causing the short URL creation to fail. # [4.4.0] - 2024-12-27 diff --git a/module/Core/config/dependencies.config.php b/module/Core/config/dependencies.config.php index eda556e9..1cd1e961 100644 --- a/module/Core/config/dependencies.config.php +++ b/module/Core/config/dependencies.config.php @@ -227,6 +227,7 @@ return [ ShortUrl\Helper\ShortUrlTitleResolutionHelper::class => [ 'httpClient', Config\Options\UrlShortenerOptions::class, + 'Logger_Shlink', ], ShortUrl\Helper\ShortUrlRedirectionBuilder::class => [ Config\Options\TrackingOptions::class, diff --git a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php index 5af78345..366e18e2 100644 --- a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php +++ b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php @@ -4,14 +4,19 @@ declare(strict_types=1); namespace Shlinkio\Shlink\Core\ShortUrl\Helper; +use Closure; use Fig\Http\Message\RequestMethodInterface; use GuzzleHttp\ClientInterface; use GuzzleHttp\RequestOptions; +use Laminas\Stdlib\ErrorHandler; use Psr\Http\Message\ResponseInterface; +use Psr\Log\LoggerInterface; use Shlinkio\Shlink\Core\Config\Options\UrlShortenerOptions; use Throwable; +use function function_exists; use function html_entity_decode; +use function iconv; use function mb_convert_encoding; use function preg_match; use function str_contains; @@ -30,9 +35,14 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH // Matches the charset inside a Content-Type header private const string CHARSET_VALUE = '/charset=([^;]+)/i'; + /** + * @param (Closure(): bool)|null $isIconvInstalled + */ public function __construct( private ClientInterface $httpClient, private UrlShortenerOptions $options, + private LoggerInterface $logger, + private Closure|null $isIconvInstalled = null, ) { } @@ -58,7 +68,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH } $title = $this->tryToResolveTitle($response, $contentType); - return $title !== null ? $data->withResolvedTitle($title) : $data; + return $title !== null ? $data->withResolvedTitle(html_entity_decode(trim($title))) : $data; } private function fetchUrl(string $url): ResponseInterface|null @@ -84,6 +94,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH { $collectedBody = ''; $body = $response->getBody(); + // With streaming enabled, we can walk the body until the tag is found, and then stop while (! str_contains($collectedBody, '') && ! $body->eof()) { $collectedBody .= $body->read(1024); @@ -95,12 +106,48 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH return null; } - // Get the page's charset from Content-Type header - preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches); + $titleInOriginalEncoding = $titleMatches[1]; - $title = isset($charsetMatches[1]) - ? mb_convert_encoding($titleMatches[1], 'utf8', $charsetMatches[1]) - : $titleMatches[1]; - return html_entity_decode(trim($title)); + // Get the page's charset from Content-Type header, or return title as is if not found + preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches); + if (! isset($charsetMatches[1])) { + return $titleInOriginalEncoding; + } + + $pageCharset = $charsetMatches[1]; + return $this->encodeToUtf8WithMbString($titleInOriginalEncoding, $pageCharset) + ?? $this->encodeToUtf8WithIconv($titleInOriginalEncoding, $pageCharset) + ?? $titleInOriginalEncoding; + } + + private function encodeToUtf8WithMbString(string $titleInOriginalEncoding, string $pageCharset): string|null + { + try { + return mb_convert_encoding($titleInOriginalEncoding, 'utf-8', $pageCharset); + } catch (Throwable $e) { + $this->logger->warning('It was impossible to encode page title in UTF-8 with mb_convert_encoding. {e}', [ + 'e' => $e, + ]); + return null; + } + } + + private function encodeToUtf8WithIconv(string $titleInOriginalEncoding, string $pageCharset): string|null + { + $isIconvInstalled = ($this->isIconvInstalled ?? fn () => function_exists('iconv'))(); + if (! $isIconvInstalled) { + $this->logger->warning('Missing iconv extension. Skipping title encoding'); + return null; + } + + try { + ErrorHandler::start(); + $title = iconv($pageCharset, 'utf-8', $titleInOriginalEncoding); + ErrorHandler::stop(throw: true); + return $title ?: null; + } catch (Throwable $e) { + $this->logger->warning('It was impossible to encode page title in UTF-8 with iconv. {e}', ['e' => $e]); + return null; + } } } diff --git a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php index b11a28a3..787ca294 100644 --- a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php +++ b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php @@ -16,6 +16,7 @@ use PHPUnit\Framework\Attributes\TestWith; use PHPUnit\Framework\MockObject\Builder\InvocationMocker; use PHPUnit\Framework\MockObject\MockObject; use PHPUnit\Framework\TestCase; +use Psr\Log\LoggerInterface; use Shlinkio\Shlink\Core\Config\Options\UrlShortenerOptions; use Shlinkio\Shlink\Core\ShortUrl\Helper\ShortUrlTitleResolutionHelper; use Shlinkio\Shlink\Core\ShortUrl\Model\ShortUrlCreation; @@ -25,10 +26,12 @@ class ShortUrlTitleResolutionHelperTest extends TestCase private const string LONG_URL = 'http://foobar.com/12345/hello?foo=bar'; private MockObject & ClientInterface $httpClient; + private MockObject & LoggerInterface $logger; protected function setUp(): void { $this->httpClient = $this->createMock(ClientInterface::class); + $this->logger = $this->createMock(LoggerInterface::class); } #[Test] @@ -90,14 +93,59 @@ class ShortUrlTitleResolutionHelperTest extends TestCase } #[Test] - #[TestWith(['TEXT/html; charset=utf-8'], 'charset')] - #[TestWith(['TEXT/html'], 'no charset')] - public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType): void + #[TestWith(['TEXT/html', false], 'no charset')] + #[TestWith(['TEXT/html; charset=utf-8', false], 'mbstring-supported charset')] + #[TestWith(['TEXT/html; charset=Windows-1255', true], 'mbstring-unsupported charset')] + public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType, bool $expectsWarning): void { - $data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]); $this->expectRequestToBeCalled()->willReturn($this->respWithTitle($contentType)); + if ($expectsWarning) { + $this->logger->expects($this->once())->method('warning')->with( + 'It was impossible to encode page title in UTF-8 with mb_convert_encoding. {e}', + $this->isArray(), + ); + } else { + $this->logger->expects($this->never())->method('warning'); + } - $result = $this->helper(autoResolveTitles: true)->processTitle($data); + $data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]); + $result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data); + + self::assertNotSame($data, $result); + self::assertEquals('Resolved "title"', $result->title); + } + + #[Test] + #[TestWith([ + 'contentType' => 'text/html; charset=Windows-1255', + 'iconvEnabled' => false, + 'expectedSecondMessage' => 'Missing iconv extension. Skipping title encoding', + ])] + #[TestWith([ + 'contentType' => 'text/html; charset=foo', + 'iconvEnabled' => true, + 'expectedSecondMessage' => 'It was impossible to encode page title in UTF-8 with iconv. {e}', + ])] + public function warningsLoggedWhenTitleCannotBeEncodedToUtf8( + string $contentType, + bool $iconvEnabled, + string $expectedSecondMessage, + ): void { + $this->expectRequestToBeCalled()->willReturn($this->respWithTitle($contentType)); + $callCount = 0; + $this->logger->expects($this->exactly(2))->method('warning')->with($this->callback( + function (string $message) use (&$callCount, $expectedSecondMessage): bool { + $callCount++; + if ($callCount === 1) { + return $message === 'It was impossible to encode page title in UTF-8 with mb_convert_encoding. {e}'; + } + + return $message === $expectedSecondMessage; + }, + )); + + $data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]); + $result = $this->helper(autoResolveTitles: true, iconvEnabled: $iconvEnabled)->processTitle($data); self::assertNotSame($data, $result); self::assertEquals('Resolved "title"', $result->title); @@ -143,11 +191,13 @@ class ShortUrlTitleResolutionHelperTest extends TestCase return $body; } - private function helper(bool $autoResolveTitles = false): ShortUrlTitleResolutionHelper + private function helper(bool $autoResolveTitles = false, bool $iconvEnabled = false): ShortUrlTitleResolutionHelper { return new ShortUrlTitleResolutionHelper( $this->httpClient, new UrlShortenerOptions(autoResolveTitles: $autoResolveTitles), + $this->logger, + fn () => $iconvEnabled, ); } }