diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ab0a09a..1257e3d2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
### Fixed
* [#2111](https://github.com/shlinkio/shlink/issues/2111) Fix typo in OAS docs examples where redirect rules with `query-param` condition type were defined as `query`.
+* [#2129](https://github.com/shlinkio/shlink/issues/2129) Fix error when resolving title for sites not using UTF-8 charset (detected with Japanese charsets).
## [4.1.0] - 2024-04-14
diff --git a/composer.json b/composer.json
index 82a4a93a..c94cbdfa 100644
--- a/composer.json
+++ b/composer.json
@@ -16,6 +16,7 @@
"ext-curl": "*",
"ext-gd": "*",
"ext-json": "*",
+ "ext-mbstring": "*",
"ext-pdo": "*",
"akrabat/ip-address-middleware": "^2.1",
"cakephp/chronos": "^3.0.2",
diff --git a/config/constants.php b/config/constants.php
index 51ee0476..20c64f19 100644
--- a/config/constants.php
+++ b/config/constants.php
@@ -12,7 +12,6 @@ const MIN_SHORT_CODES_LENGTH = 4;
const DEFAULT_REDIRECT_STATUS_CODE = RedirectStatus::STATUS_302;
const DEFAULT_REDIRECT_CACHE_LIFETIME = 30;
const LOCAL_LOCK_FACTORY = 'Shlinkio\Shlink\LocalLockFactory';
-const TITLE_TAG_VALUE = '/
]*>(.*?)<\/title>/i'; // Matches the value inside a html title tag
const LOOSE_URI_MATCHER = '/(.+)\:(.+)/i'; // Matches anything starting with a schema.
const DEFAULT_QR_CODE_SIZE = 300;
const DEFAULT_QR_CODE_MARGIN = 0;
diff --git a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php
index e91b1ff1..3f9b6225 100644
--- a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php
+++ b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php
@@ -12,20 +12,24 @@ use Shlinkio\Shlink\Core\Options\UrlShortenerOptions;
use Throwable;
use function html_entity_decode;
+use function mb_convert_encoding;
use function preg_match;
use function str_contains;
use function str_starts_with;
use function strtolower;
use function trim;
-use const Shlinkio\Shlink\TITLE_TAG_VALUE;
-
readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionHelperInterface
{
public const MAX_REDIRECTS = 15;
public const CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
. 'Chrome/121.0.0.0 Safari/537.36';
+ // Matches the value inside a html title tag
+ private const TITLE_TAG_VALUE = '/]*>(.*?)<\/title>/i';
+ // Matches the charset inside a Content-Type header
+ private const CHARSET_VALUE = '/charset=([^;]+)/i';
+
public function __construct(
private ClientInterface $httpClient,
private UrlShortenerOptions $options,
@@ -53,7 +57,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
return $data;
}
- $title = $this->tryToResolveTitle($response);
+ $title = $this->tryToResolveTitle($response, $contentType);
return $title !== null ? $data->withResolvedTitle($title) : $data;
}
@@ -76,7 +80,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
}
}
- private function tryToResolveTitle(ResponseInterface $response): ?string
+ private function tryToResolveTitle(ResponseInterface $response, string $contentType): ?string
{
$collectedBody = '';
$body = $response->getBody();
@@ -84,12 +88,19 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
while (! str_contains($collectedBody, '') && ! $body->eof()) {
$collectedBody .= $body->read(1024);
}
- preg_match(TITLE_TAG_VALUE, $collectedBody, $matches);
- return isset($matches[1]) ? $this->normalizeTitle($matches[1]) : null;
- }
- private function normalizeTitle(string $title): string
- {
+ // Try to match the title from the tag
+ preg_match(self::TITLE_TAG_VALUE, $collectedBody, $titleMatches);
+ if (! isset($titleMatches[1])) {
+ return null;
+ }
+
+ // Get the page's charset from Content-Type header
+ preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
+
+ $title = isset($charsetMatches[1])
+ ? mb_convert_encoding($titleMatches[1], 'utf8', $charsetMatches[1])
+ : $titleMatches[1];
return html_entity_decode(trim($title));
}
}
diff --git a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php
index 06b47f8c..92fac8eb 100644
--- a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php
+++ b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php
@@ -12,6 +12,7 @@ use Laminas\Diactoros\Response;
use Laminas\Diactoros\Response\JsonResponse;
use Laminas\Diactoros\Stream;
use PHPUnit\Framework\Attributes\Test;
+use PHPUnit\Framework\Attributes\TestWith;
use PHPUnit\Framework\MockObject\Builder\InvocationMocker;
use PHPUnit\Framework\MockObject\MockObject;
use PHPUnit\Framework\TestCase;
@@ -89,10 +90,12 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
}
#[Test]
- public function titleIsUpdatedWhenItCanBeResolvedFromResponse(): void
+ #[TestWith(['TEXT/html; charset=utf-8'], name: 'charset')]
+ #[TestWith(['TEXT/html'], name: 'no charset')]
+ public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType): void
{
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
- $this->expectRequestToBeCalled()->willReturn($this->respWithTitle());
+ $this->expectRequestToBeCalled()->willReturn($this->respWithTitle($contentType));
$result = $this->helper(autoResolveTitles: true)->processTitle($data);
@@ -122,10 +125,10 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
return new Response($body, 200, ['Content-Type' => 'text/html']);
}
- private function respWithTitle(): Response
+ private function respWithTitle(string $contentType): Response
{
$body = $this->createStreamWithContent(' Resolved "title" ');
- return new Response($body, 200, ['Content-Type' => 'TEXT/html; charset=utf-8']);
+ return new Response($body, 200, ['Content-Type' => $contentType]);
}
private function createStreamWithContent(string $content): Stream