-
Notifications
You must be signed in to change notification settings - Fork 8.3k
A query does not work with parallel replicas. #65587
Copy link
Copy link
Open
Labels
bugConfirmed user-visible misbehaviour in official releaseConfirmed user-visible misbehaviour in official releasecomp-parallel-replicasParallel reading from replicas for query speed (not data replication consistency).Parallel reading from replicas for query speed (not data replication consistency).
Description
CREATE TABLE minicrawl
(
date Date DEFAULT today(),
rank UInt32,
domain String,
log String CODEC(ZSTD(6)),
content String CODEC(ZSTD(6))
)
ENGINE = MergeTree
ORDER BY (domain, date);
CREATE TABLE minicrawl_processed
(
date Date,
rank String,
domain String,
is_idn Bool,
top_level_domain LowCardinality(String),
success Bool,
log String CODEC(ZSTD(6)),
title String,
text String CODEC(ZSTD(6)),
converted_content String CODEC(ZSTD(6)),
curl_error String,
curl_error_code Enum(
'OK' = 0,
'UNSUPPORTED_PROTOCOL' = 1,
'FAILED_INIT' = 2,
'URL_MALFORMAT' = 3,
'NOT_BUILT_IN' = 4,
'COULDNT_RESOLVE_PROXY' = 5,
'COULDNT_RESOLVE_HOST' = 6,
'COULDNT_CONNECT' = 7,
'WEIRD_SERVER_REPLY' = 8,
'REMOTE_ACCESS_DENIED' = 9,
'FTP_ACCEPT_FAILED' = 10,
'FTP_WEIRD_PASS_REPLY' = 11,
'FTP_ACCEPT_TIMEOUT' = 12,
'FTP_WEIRD_PASV_REPLY' = 13,
'FTP_WEIRD_227_FORMAT' = 14,
'FTP_CANT_GET_HOST' = 15,
'HTTP2' = 16,
'FTP_COULDNT_SET_TYPE' = 17,
'PARTIAL_FILE' = 18,
'FTP_COULDNT_RETR_FILE' = 19,
'QUOTE_ERROR' = 21,
'HTTP_RETURNED_ERROR' = 22,
'WRITE_ERROR' = 23,
'UPLOAD_FAILED' = 25,
'READ_ERROR' = 26,
'OUT_OF_MEMORY' = 27,
'OPERATION_TIMEDOUT' = 28,
'FTP_PORT_FAILED' = 30,
'FTP_COULDNT_USE_REST' = 31,
'RANGE_ERROR' = 33,
'HTTP_POST_ERROR' = 34,
'SSL_CONNECT_ERROR' = 35,
'BAD_DOWNLOAD_RESUME' = 36,
'FILE_COULDNT_READ_FILE' = 37,
'LDAP_CANNOT_BIND' = 38,
'LDAP_SEARCH_FAILED' = 39,
'FUNCTION_NOT_FOUND' = 41,
'ABORTED_BY_CALLBACK' = 42,
'BAD_FUNCTION_ARGUMENT' = 43,
'INTERFACE_FAILED' = 45,
'TOO_MANY_REDIRECTS' = 47,
'UNKNOWN_OPTION' = 48,
'TELNET_OPTION_SYNTAX' = 49,
'GOT_NOTHING' = 52,
'SSL_ENGINE_NOTFOUND' = 53,
'SSL_ENGINE_SETFAILED' = 54,
'SEND_ERROR' = 55,
'RECV_ERROR' = 56,
'SSL_CERTPROBLEM' = 58,
'SSL_CIPHER' = 59,
'PEER_FAILED_VERIFICATION' = 60,
'BAD_CONTENT_ENCODING' = 61,
'LDAP_INVALID_URL' = 62,
'FILESIZE_EXCEEDED' = 63,
'USE_SSL_FAILED' = 64,
'SEND_FAIL_REWIND' = 65,
'SSL_ENGINE_INITFAILED' = 66,
'LOGIN_DENIED' = 67,
'TFTP_NOTFOUND' = 68,
'TFTP_PERM' = 69,
'REMOTE_DISK_FULL' = 70,
'TFTP_ILLEGAL' = 71,
'TFTP_UNKNOWNID' = 72,
'REMOTE_FILE_EXISTS' = 73,
'TFTP_NOSUCHUSER' = 74,
'CONV_FAILED' = 75,
'CONV_REQD' = 76,
'SSL_CACERT_BADFILE' = 77,
'REMOTE_FILE_NOT_FOUND' = 78,
'SSH' = 79,
'SSL_SHUTDOWN_FAILED' = 80,
'AGAIN' = 81,
'SSL_CRL_BADFILE' = 82,
'SSL_ISSUER_ERROR' = 83,
'FTP_PRET_FAILED' = 84,
'RTSP_CSEQ_ERROR' = 85,
'RTSP_SESSION_ERROR' = 86,
'FTP_BAD_FILE_LIST' = 87,
'CHUNK_FAILED' = 88,
'NO_CONNECTION_AVAILABLE' = 89,
'SSL_PINNEDPUBKEYNOTMATCH' = 90,
'SSL_INVALIDCERTSTATUS' = 91,
'HTTP2_STREAM' = 92,
'RECURSIVE_API_CALL' = 93,
'AUTH_ERROR' = 94,
'HTTP3' = 95
),
num_redirects UInt8,
final_url String,
final_url_domain String,
redirect_urls Array(String),
resolved_address String,
resolved_address_ipv4 IPv4,
resolved_address_ipv6 IPv6,
final_resolved_address String,
final_resolved_address_ipv4 IPv4,
final_resolved_address_ipv6 IPv6,
is_https Bool,
http_version LowCardinality(String),
http_code UInt16,
http_message LowCardinality(String),
all_header_names Array(LowCardinality(String)),
headers Map(LowCardinality(String), String),
last_headers Map(LowCardinality(String), String),
server_full LowCardinality(String),
server LowCardinality(String),
server_time DateTime,
last_modified DateTime,
expires DateTime,
connection LowCardinality(String),
content_type_all LowCardinality(String),
content_type LowCardinality(String),
content_type_charset LowCardinality(String),
real_content_length UInt32,
header_content_length UInt32,
transfer_encoding LowCardinality(String),
powered_by_full LowCardinality(String),
powered_by LowCardinality(String),
redirect_by LowCardinality(String),
age UInt32,
has_etag Bool,
has_csp Bool,
has_hsts Bool,
hsts_max_age UInt32,
hsts_include_subdomains Bool,
csp_report String,
keep_alive_timeout UInt32,
keep_alive_max UInt32,
via String,
header_content_language LowCardinality(String),
header_content_country LowCardinality(String),
accept_client_hints String,
accept_ranges Bool,
upgrade_protocols Array(LowCardinality(String)),
tls_version LowCardinality(String),
tls_cipher LowCardinality(String),
alpn_protocol LowCardinality(String),
certificate_ok Bool,
certificate_cn String,
certificate_issuer LowCardinality(String),
certificate_issuer_country LowCardinality(String),
certificate_start_date DateTime,
certificate_expire_date DateTime,
doctype_tag LowCardinality(String),
doctype LowCardinality(String),
doctype_id LowCardinality(String),
doctype_url LowCardinality(String),
is_utf8 Bool,
unique_tags Array(LowCardinality(String)),
tags_attributes Array(Tuple(LowCardinality(String), Map(LowCardinality(String), String))),
links Array(String),
images Array(String),
scripts Array(String),
iframes Array(String),
stylesheets Array(String),
external_links Array(String),
external_images Array(String),
external_scripts Array(String),
external_iframes Array(String),
external_stylesheets Array(String),
external_links_domains Array(String),
external_images_domains Array(String),
external_scripts_domains Array(String),
external_iframes_domains Array(String),
external_stylesheets_domains Array(String),
script_languages Array(LowCardinality(String)),
icon String,
xml_namespaces Array(LowCardinality(String)),
bom Enum('', 'little', 'big', 'utf8'),
html_language_full LowCardinality(String),
html_language LowCardinality(String),
html_language_country LowCardinality(String),
meta Map(LowCardinality(String), String),
generator LowCardinality(String),
nel_url String,
nel_group LowCardinality(String)
)
ENGINE = MergeTree
ORDER BY (domain, date);
SET max_ast_elements = 100000;
INSERT INTO minicrawl_processed
WITH
domain LIKE '%xn--%' AS is_idn,
topLevelDomain(domain) AS top_level_domain,
extract(log, 'curl: (\(\d+\) [^\n]+)') AS curl_error,
toUInt8OrZero(extract(log, 'curl: \((\d+)\) [^\n]+')) AS curl_error_code,
curl_error_code = 0 AS success,
countSubstrings(log, 'Issue another request to this URL') AS num_redirects,
(empty(extractAll(log, '[Ll]ocation: (\S+)')[-1] AS final_redirect) ? 'http://' || domain || '/' : final_redirect) AS final_url,
domain(final_url) AS final_url_domain,
extractAll(log, '[Ll]ocation: (\S+)') AS redirect_urls,
extract(log, '\* Trying ([0-9a-z\.\:\[\]]+):\d+\.\.\.') AS resolved_address,
toIPv4OrZero(resolved_address) AS resolved_address_ipv4,
toIPv6OrZero(resolved_address) AS resolved_address_ipv6,
extractAll(log, '\* Trying ([0-9a-z\.\:\[\]]+):\d+\.\.\.')[-1] AS final_resolved_address,
toIPv4OrZero(final_resolved_address) AS final_resolved_address_ipv4,
toIPv6OrZero(final_resolved_address) AS final_resolved_address_ipv6,
log LIKE '%TLS handshake%' AS is_https,
extractAll(log, '< HTTP/([\d\.]+)')[-1] AS http_version,
toUInt16OrZero(extractAll(log, '< HTTP/[\d\.]+ (\d+)')[-1]) AS http_code,
extractAll(log, '< HTTP/[\\d\\.]+ \\d+(?: ([^\r\n]+))?')[-1] AS http_message,
arrayMap(x -> lower(x), extractAll(log, '< ([\w\-]+): [^\r\n]+')) AS all_header_names,
arrayMap(x -> CAST((x[1], x[2]) AS Tuple(name String, value String)), extractAllGroupsVertical(log, '< ([\w\-]+): ([^\r\n]+)')) AS all_headers,
extractAllGroupsHorizontal(log, '< ([\w\-]+): ([^\r\n]+)') AS headers_arrays,
CAST((arrayMap(x -> lower(x), headers_arrays[1]), headers_arrays[2]) AS Map(String, String)) AS headers,
CAST(arrayFilter((kv, i) -> i = 1, reverse(CAST(headers AS Array(Tuple(String, String)))), arrayEnumerateUniq(reverse(mapKeys(headers)))) AS Map(String, String)) AS last_headers,
last_headers['server'] AS server_full,
extract(server_full, '^[^/ ]+') AS server,
parseDateTimeBestEffortOrZero(last_headers['date']) AS server_time,
parseDateTimeBestEffortOrZero(last_headers['last-modified']) AS last_modified,
parseDateTimeBestEffortOrZero(last_headers['expires']) AS expires,
last_headers['connection'] AS connection,
last_headers['content-type'] AS content_type_all,
lower(extract(content_type_all, '^([^;]+)[;,]?')) AS content_type,
lower(extract(content_type_all, 'charset=([\w\-]+)')) AS content_type_charset,
length(content) AS real_content_length,
toUInt32OrZero(last_headers['content-length']) AS header_content_length,
multiIf(
content_type_charset IN ('gbk', 'gb2312', 'gb18030'), convertCharset(content, 'gb18030', 'utf-8'),
content_type_charset IN ('euc-jp', 'eucjp'), convertCharset(content, 'euc-jp', 'utf-8'),
content_type_charset IN ('euc-kr', 'euckr'), convertCharset(content, 'euc-kr', 'utf-8'),
content_type_charset IN ('shift-jis', 'shift_jis', 'sjis', 'cp932'), convertCharset(content, 'shift-jis', 'utf-8'),
content_type_charset IN ('tis-620'), convertCharset(content, 'tis-620', 'utf-8'),
content_type_charset IN ('big5'), convertCharset(content, 'big5', 'utf-8'),
content_type_charset IN ('koi8-r'), convertCharset(content, 'koi8-r', 'utf-8'),
content_type_charset IN ('koi8-u'), convertCharset(content, 'koi8-u', 'utf-8'),
content_type_charset IN ('iso-8859-1', 'latin1', 'iso8859-1'), convertCharset(content, 'iso-8859-1', 'utf-8'),
content_type_charset IN ('iso-8859-2', 'latin2'), convertCharset(content, 'iso-8859-2', 'utf-8'),
content_type_charset IN ('iso-8859-7'), convertCharset(content, 'iso-8859-7', 'utf-8'),
content_type_charset IN ('iso-8859-9', 'latin5'), convertCharset(content, 'iso-8859-9', 'utf-8'),
content_type_charset IN ('iso-8859-15', 'latin9', 'latin0', 'latin6'), convertCharset(content, 'iso-8859-15', 'utf-8'),
content_type_charset IN ('windows-1250', 'windows1250', 'cp1250'), convertCharset(content, 'cp1250', 'utf-8'),
content_type_charset IN ('windows-1251', 'windows1251', 'cp1251'), convertCharset(content, 'cp1251', 'utf-8'),
content_type_charset IN ('windows-1252', 'windows1252', 'cp1252'), convertCharset(content, 'cp1252', 'utf-8'),
content_type_charset IN ('windows-1253', 'windows1253', 'cp1253'), convertCharset(content, 'cp1253', 'utf-8'),
content_type_charset IN ('windows-1254', 'windows1254', 'cp1254'), convertCharset(content, 'cp1254', 'utf-8'),
content_type_charset IN ('windows-1255', 'windows1255', 'cp1255'), convertCharset(content, 'cp1255', 'utf-8'),
content_type_charset IN ('windows-1256', 'windows1256', 'cp1256'), convertCharset(content, 'cp1256', 'utf-8'),
content_type_charset IN ('windows-1257', 'windows1257', 'cp1257'), convertCharset(content, 'cp1257', 'utf-8'),
content) AS converted_content,
last_headers['transfer-encoding'] AS transfer_encoding,
last_headers['x-powered-by'] AS powered_by_full,
extract(powered_by_full, '^[^/ ]+') AS powered_by,
last_headers['x-redirect-by'] AS redirect_by,
toInt32OrZero(last_headers['age']) AS age,
mapContains(last_headers, 'etag') AS has_etag,
mapContains(last_headers, 'content-security-policy') AS has_csp,
mapContains(last_headers, 'strict-transport-security') AS has_hsts,
toUInt32OrZero(extract(last_headers['strict-transport-security'], 'max-age=(\d+)')) AS hsts_max_age,
last_headers['strict-transport-security'] LIKE '%includeSubDomains%' AS hsts_include_subdomains,
extract(last_headers['content-security-policy'], 'report-uri ?(http[^ ;]+)') AS csp_report_uri,
extract(last_headers['content-security-policy'], 'report-to .+?"url": ?"([^"]+)"') AS csp_report_to,
empty(csp_report_to) ? csp_report_uri : csp_report_to AS csp_report,
last_headers['keep-alive'] AS keep_alive,
toUInt32OrZero(extract(keep_alive, 'timeout=(\d+)')) AS keep_alive_timeout,
toUInt32OrZero(extract(keep_alive, 'max=(\d+)')) AS keep_alive_max,
last_headers['via'] AS via,
last_headers['content-language'] AS header_content_language_full,
lower(extract(header_content_language_full, '^\w\w')) AS header_content_language,
lower(extract(header_content_language_full, '-(\w\w)')) AS header_content_country,
last_headers['accept-ch'] AS accept_client_hints,
lower(last_headers['accept-ranges']) = 'bytes' AS accept_ranges,
splitByRegexp(', *', last_headers['upgrade']) AS upgrade_protocols,
extract(log, '\* SSL connection using (\S+) ') AS tls_full,
extract(tls_full, 'TLSv([\d\.]+)') AS tls_version,
extract(log, '\* SSL connection using \S+ / (\w+)') AS tls_cipher,
extract(log, '\* ALPN, server accepted to use (\S+)') AS alpn_protocol,
log LIKE '%\n* SSL certificate verify ok%' AS certificate_ok,
extract(log, '\n\* subject:.+? CN=([^;\s]+)') AS certificate_cn,
extract(log, '\n\* .+O=([^;\n]+)') AS certificate_issuer,
extract(log, '\n\* .+C=([^;\n]+)') AS certificate_issuer_country,
parseDateTimeBestEffortOrZero(extract(log, '\n\* start date: ([^;\n]+)\n')) AS certificate_start_date,
parseDateTimeBestEffortOrZero(extract(log, '\n\* expire date: ([^;\n]+)\n')) AS certificate_expire_date,
extract(converted_content, '(?i)<!DOCTYPE [^>]+>') AS doctype_tag,
extract(lower(doctype_tag), '<!doctype (\w+)') AS doctype,
replaceRegexpAll(extract(lower(doctype_tag), '"-//w3c//dtd ([^/"]+)'), '\s+', ' ') AS doctype_id,
extract(lower(doctype_tag), '"([^"]+.dtd)"') AS doctype_url,
decodeHTMLComponent(extractTextFromHTML(converted_content)) AS text,
isValidUTF8(content) AS is_utf8,
extractAll(converted_content, '<[\w:\-]+(?: [^>]*)?>') AS tags,
arrayDistinct(arrayMap(x -> lower(extract(x, '<([\w:\-]+)')), tags)) AS unique_tags,
arrayMap(tag -> (extract(tag, '^<([\w:\-]+)'), arrayMap(kv -> ((extractGroups(kv, '([\w:\-]+)=["\'`]?([^>"\'`]*)["\'`]?') AS groups)[1], groups[2]), extractAll(tag, '\s+([\w:\-]+=(?:["\'`][^>"\'`]*["\'`]|[^"\'`\s]\S+))'))::Map(String, String)), tags) AS tags_attributes,
arrayMap(x -> x.2['href'], arrayFilter(x -> x.1 = 'a' AND mapContains(x.2, 'href'), tags_attributes)) AS links,
arrayMap(x -> x.2['src'], arrayFilter(x -> x.1 = 'img' AND mapContains(x.2, 'src'), tags_attributes)) AS images,
arrayMap(x -> x.2['src'], arrayFilter(x -> x.1 = 'script' AND mapContains(x.2, 'src'), tags_attributes)) AS scripts,
arrayMap(x -> x.2['src'], arrayFilter(x -> x.1 = 'iframe' AND mapContains(x.2, 'src'), tags_attributes)) AS iframes,
arrayMap(x -> x.2['href'], arrayFilter(x -> x.1 = 'link' AND x.2['rel'] = 'stylesheet' AND mapContains(x.2, 'href'), tags_attributes)) AS stylesheets,
arrayDistinct(arrayMap(x -> x LIKE '//%' ? 'https:' || x : x, arrayFilter(x -> match(x, '^(?:[\w\-]+:)?//'), links))) AS external_links,
arrayDistinct(arrayMap(x -> x LIKE '//%' ? 'https:' || x : x, arrayFilter(x -> match(x, '^(?:[\w\-]+:)?//'), images))) AS external_images,
arrayDistinct(arrayMap(x -> x LIKE '//%' ? 'https:' || x : x, arrayFilter(x -> match(x, '^(?:[\w\-]+:)?//'), scripts))) AS external_scripts,
arrayDistinct(arrayMap(x -> x LIKE '//%' ? 'https:' || x : x, arrayFilter(x -> match(x, '^(?:[\w\-]+:)?//'), iframes))) AS external_iframes,
arrayDistinct(arrayMap(x -> x LIKE '//%' ? 'https:' || x : x, arrayFilter(x -> match(x, '^(?:[\w\-]+:)?//'), stylesheets))) AS external_stylesheets,
arrayDistinct(arrayMap(x -> domainWithoutWWW(x), external_links)) AS external_links_domains,
arrayDistinct(arrayMap(x -> domainWithoutWWW(x), external_images)) AS external_images_domains,
arrayDistinct(arrayMap(x -> domainWithoutWWW(x), external_scripts)) AS external_scripts_domains,
arrayDistinct(arrayMap(x -> domainWithoutWWW(x), external_iframes)) AS external_iframes_domains,
arrayDistinct(arrayMap(x -> domainWithoutWWW(x), external_stylesheets)) AS external_stylesheets_domains,
arrayDistinct(arrayMap(x -> x.2['type'] != '' ? x.2['type'] : x.2['language'], arrayFilter(x -> x.1 = 'script' AND (mapContains(x.2, 'type') OR mapContains(x.2, 'language')), tags_attributes))) AS script_languages,
arrayFirst(x -> x.1 = 'link' AND mapContains(x.2, 'href') AND x.2['rel'] ILIKE '%icon%', tags_attributes).2['href'] AS icon,
arrayMap(x -> (x[1], x[2]), arrayDistinct(extractAllGroupsVertical(converted_content, '\sxmlns(?:\:([\w\-]+))?=["\'`](http[^>"\'`]*)["\'`]'))) AS xml_namespaces,
(startsWith(content, '\xFE\xFF') OR startsWith(content, '\x00\x00\xFE\xFF') ? 'big'
: (startsWith(content, '\xFF\xFE') ? 'little'
: (startsWith(content, '\xEF\xBB\xBF') ? 'utf8' : ''))) AS bom,
arrayFirst(x -> x.1 = 'html', tags_attributes).2['lang'] AS html_language_full,
lower(extract(html_language_full, '^\w\w')) AS html_language,
lower(extract(html_language_full, '-(\w\w)')) AS html_language_country,
decodeHTMLComponent(extract(converted_content, '(?i)<title>\s*([^<]+)\s*</title>')) AS title,
arrayMap(x -> (x.2['name'] != '' ? x.2['name'] : x.2['property'], decodeHTMLComponent(x.2['content'])), arrayFilter(x -> x.1 = 'meta' AND (mapContains(x.2, 'name') OR mapContains(x.2, 'property')), tags_attributes))::Map(String, String) AS meta,
meta['generator'] AS generator,
cutQueryString(JSON_VALUE(last_headers['report-to'], '$.endpoints[*].url')) AS nel_url,
cutQueryString(JSON_VALUE(last_headers['report-to'], '$.group')) AS nel_group,
1 SELECT
date,
rank,
domain,
is_idn,
top_level_domain,
success,
log,
title,
text,
converted_content,
curl_error,
curl_error_code,
num_redirects,
final_url,
final_url_domain,
redirect_urls,
resolved_address,
resolved_address_ipv4,
resolved_address_ipv6,
final_resolved_address,
final_resolved_address_ipv4,
final_resolved_address_ipv6,
is_https,
http_version,
http_code,
http_message,
all_header_names,
headers,
last_headers,
server_full,
server,
server_time,
last_modified,
expires,
connection,
content_type_all,
content_type,
content_type_charset,
real_content_length,
header_content_length,
transfer_encoding,
powered_by_full,
powered_by,
redirect_by,
age,
has_etag,
has_csp,
has_hsts,
hsts_max_age,
hsts_include_subdomains,
csp_report,
keep_alive_timeout,
keep_alive_max,
via,
header_content_language,
header_content_country,
accept_client_hints,
accept_ranges,
upgrade_protocols,
tls_version,
tls_cipher,
alpn_protocol,
certificate_ok,
certificate_cn,
certificate_issuer,
certificate_issuer_country,
certificate_start_date,
certificate_expire_date,
doctype_tag,
doctype,
doctype_id,
doctype_url,
is_utf8,
unique_tags,
tags_attributes,
links,
images,
scripts,
iframes,
stylesheets,
external_links,
external_images,
external_scripts,
external_iframes,
external_stylesheets,
external_links_domains,
external_images_domains,
external_scripts_domains,
external_iframes_domains,
external_stylesheets_domains,
script_languages,
icon,
xml_namespaces,
bom,
html_language_full,
html_language,
html_language_country,
meta,
generator,
nel_url,
nel_group
FROM minicrawl;
Yields this error:
Received exception from server (version 24.5.1): Code: 47. DB::Exception: Received from nl41c6o1h4.eu-west-1.aws.clickhouse-staging.com:9440. DB::Exception: Received from c-claret-aa-77-server-2zfz4h6-0.c-claret-aa-77-server-headless.ns-claret-aa-77.svc.cluster.local:9000. DB::Exception: Unknown expression or function identifier 'groups' in scope kv -> (groups[1], groups[2]). (UNKNOWN_IDENTIFIER)
Unless I disable parallel replicas.
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bugConfirmed user-visible misbehaviour in official releaseConfirmed user-visible misbehaviour in official releasecomp-parallel-replicasParallel reading from replicas for query speed (not data replication consistency).Parallel reading from replicas for query speed (not data replication consistency).