mirror of
https://github.com/shlinkio/shlink.git
synced 2024-11-22 08:56:42 -06:00
Merge pull request #2135 from acelaya-forks/feature/non-utf8-titles
Convert encoding of resolved titles based on page encoding
This commit is contained in:
parent
c855f011d1
commit
fb4fecf411
@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
|
|||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
* [#2111](https://github.com/shlinkio/shlink/issues/2111) Fix typo in OAS docs examples where redirect rules with `query-param` condition type were defined as `query`.
|
* [#2111](https://github.com/shlinkio/shlink/issues/2111) Fix typo in OAS docs examples where redirect rules with `query-param` condition type were defined as `query`.
|
||||||
|
* [#2129](https://github.com/shlinkio/shlink/issues/2129) Fix error when resolving title for sites not using UTF-8 charset (detected with Japanese charsets).
|
||||||
|
|
||||||
|
|
||||||
## [4.1.0] - 2024-04-14
|
## [4.1.0] - 2024-04-14
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
"ext-curl": "*",
|
"ext-curl": "*",
|
||||||
"ext-gd": "*",
|
"ext-gd": "*",
|
||||||
"ext-json": "*",
|
"ext-json": "*",
|
||||||
|
"ext-mbstring": "*",
|
||||||
"ext-pdo": "*",
|
"ext-pdo": "*",
|
||||||
"akrabat/ip-address-middleware": "^2.1",
|
"akrabat/ip-address-middleware": "^2.1",
|
||||||
"cakephp/chronos": "^3.0.2",
|
"cakephp/chronos": "^3.0.2",
|
||||||
|
@ -12,7 +12,6 @@ const MIN_SHORT_CODES_LENGTH = 4;
|
|||||||
const DEFAULT_REDIRECT_STATUS_CODE = RedirectStatus::STATUS_302;
|
const DEFAULT_REDIRECT_STATUS_CODE = RedirectStatus::STATUS_302;
|
||||||
const DEFAULT_REDIRECT_CACHE_LIFETIME = 30;
|
const DEFAULT_REDIRECT_CACHE_LIFETIME = 30;
|
||||||
const LOCAL_LOCK_FACTORY = 'Shlinkio\Shlink\LocalLockFactory';
|
const LOCAL_LOCK_FACTORY = 'Shlinkio\Shlink\LocalLockFactory';
|
||||||
const TITLE_TAG_VALUE = '/<title[^>]*>(.*?)<\/title>/i'; // Matches the value inside a html title tag
|
|
||||||
const LOOSE_URI_MATCHER = '/(.+)\:(.+)/i'; // Matches anything starting with a schema.
|
const LOOSE_URI_MATCHER = '/(.+)\:(.+)/i'; // Matches anything starting with a schema.
|
||||||
const DEFAULT_QR_CODE_SIZE = 300;
|
const DEFAULT_QR_CODE_SIZE = 300;
|
||||||
const DEFAULT_QR_CODE_MARGIN = 0;
|
const DEFAULT_QR_CODE_MARGIN = 0;
|
||||||
|
@ -12,20 +12,24 @@ use Shlinkio\Shlink\Core\Options\UrlShortenerOptions;
|
|||||||
use Throwable;
|
use Throwable;
|
||||||
|
|
||||||
use function html_entity_decode;
|
use function html_entity_decode;
|
||||||
|
use function mb_convert_encoding;
|
||||||
use function preg_match;
|
use function preg_match;
|
||||||
use function str_contains;
|
use function str_contains;
|
||||||
use function str_starts_with;
|
use function str_starts_with;
|
||||||
use function strtolower;
|
use function strtolower;
|
||||||
use function trim;
|
use function trim;
|
||||||
|
|
||||||
use const Shlinkio\Shlink\TITLE_TAG_VALUE;
|
|
||||||
|
|
||||||
readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionHelperInterface
|
readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionHelperInterface
|
||||||
{
|
{
|
||||||
public const MAX_REDIRECTS = 15;
|
public const MAX_REDIRECTS = 15;
|
||||||
public const CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
public const CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||||
. 'Chrome/121.0.0.0 Safari/537.36';
|
. 'Chrome/121.0.0.0 Safari/537.36';
|
||||||
|
|
||||||
|
// Matches the value inside a html title tag
|
||||||
|
private const TITLE_TAG_VALUE = '/<title[^>]*>(.*?)<\/title>/i';
|
||||||
|
// Matches the charset inside a Content-Type header
|
||||||
|
private const CHARSET_VALUE = '/charset=([^;]+)/i';
|
||||||
|
|
||||||
public function __construct(
|
public function __construct(
|
||||||
private ClientInterface $httpClient,
|
private ClientInterface $httpClient,
|
||||||
private UrlShortenerOptions $options,
|
private UrlShortenerOptions $options,
|
||||||
@ -53,7 +57,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
|
|||||||
return $data;
|
return $data;
|
||||||
}
|
}
|
||||||
|
|
||||||
$title = $this->tryToResolveTitle($response);
|
$title = $this->tryToResolveTitle($response, $contentType);
|
||||||
return $title !== null ? $data->withResolvedTitle($title) : $data;
|
return $title !== null ? $data->withResolvedTitle($title) : $data;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -76,7 +80,7 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private function tryToResolveTitle(ResponseInterface $response): ?string
|
private function tryToResolveTitle(ResponseInterface $response, string $contentType): ?string
|
||||||
{
|
{
|
||||||
$collectedBody = '';
|
$collectedBody = '';
|
||||||
$body = $response->getBody();
|
$body = $response->getBody();
|
||||||
@ -84,12 +88,19 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
|
|||||||
while (! str_contains($collectedBody, '</title>') && ! $body->eof()) {
|
while (! str_contains($collectedBody, '</title>') && ! $body->eof()) {
|
||||||
$collectedBody .= $body->read(1024);
|
$collectedBody .= $body->read(1024);
|
||||||
}
|
}
|
||||||
preg_match(TITLE_TAG_VALUE, $collectedBody, $matches);
|
|
||||||
return isset($matches[1]) ? $this->normalizeTitle($matches[1]) : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private function normalizeTitle(string $title): string
|
// Try to match the title from the <title /> tag
|
||||||
{
|
preg_match(self::TITLE_TAG_VALUE, $collectedBody, $titleMatches);
|
||||||
|
if (! isset($titleMatches[1])) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the page's charset from Content-Type header
|
||||||
|
preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
|
||||||
|
|
||||||
|
$title = isset($charsetMatches[1])
|
||||||
|
? mb_convert_encoding($titleMatches[1], 'utf8', $charsetMatches[1])
|
||||||
|
: $titleMatches[1];
|
||||||
return html_entity_decode(trim($title));
|
return html_entity_decode(trim($title));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,7 @@ use Laminas\Diactoros\Response;
|
|||||||
use Laminas\Diactoros\Response\JsonResponse;
|
use Laminas\Diactoros\Response\JsonResponse;
|
||||||
use Laminas\Diactoros\Stream;
|
use Laminas\Diactoros\Stream;
|
||||||
use PHPUnit\Framework\Attributes\Test;
|
use PHPUnit\Framework\Attributes\Test;
|
||||||
|
use PHPUnit\Framework\Attributes\TestWith;
|
||||||
use PHPUnit\Framework\MockObject\Builder\InvocationMocker;
|
use PHPUnit\Framework\MockObject\Builder\InvocationMocker;
|
||||||
use PHPUnit\Framework\MockObject\MockObject;
|
use PHPUnit\Framework\MockObject\MockObject;
|
||||||
use PHPUnit\Framework\TestCase;
|
use PHPUnit\Framework\TestCase;
|
||||||
@ -89,10 +90,12 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[Test]
|
#[Test]
|
||||||
public function titleIsUpdatedWhenItCanBeResolvedFromResponse(): void
|
#[TestWith(['TEXT/html; charset=utf-8'], name: 'charset')]
|
||||||
|
#[TestWith(['TEXT/html'], name: 'no charset')]
|
||||||
|
public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType): void
|
||||||
{
|
{
|
||||||
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
|
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
|
||||||
$this->expectRequestToBeCalled()->willReturn($this->respWithTitle());
|
$this->expectRequestToBeCalled()->willReturn($this->respWithTitle($contentType));
|
||||||
|
|
||||||
$result = $this->helper(autoResolveTitles: true)->processTitle($data);
|
$result = $this->helper(autoResolveTitles: true)->processTitle($data);
|
||||||
|
|
||||||
@ -122,10 +125,10 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
|
|||||||
return new Response($body, 200, ['Content-Type' => 'text/html']);
|
return new Response($body, 200, ['Content-Type' => 'text/html']);
|
||||||
}
|
}
|
||||||
|
|
||||||
private function respWithTitle(): Response
|
private function respWithTitle(string $contentType): Response
|
||||||
{
|
{
|
||||||
$body = $this->createStreamWithContent('<title data-foo="bar"> Resolved "title" </title>');
|
$body = $this->createStreamWithContent('<title data-foo="bar"> Resolved "title" </title>');
|
||||||
return new Response($body, 200, ['Content-Type' => 'TEXT/html; charset=utf-8']);
|
return new Response($body, 200, ['Content-Type' => $contentType]);
|
||||||
}
|
}
|
||||||
|
|
||||||
private function createStreamWithContent(string $content): Stream
|
private function createStreamWithContent(string $content): Stream
|
||||||
|
Loading…
Reference in New Issue
Block a user