目录
(1)安装 ClickHouse Server 和 ClickHouse Client
(1)在新主机(node4)上安装 ClickHouse Server 和 ClickHouse Client
5. 使用 ReplicatedMergeTree 表引擎复制数据
ClickHouse 可以在任何具有x86_64、AArch64 或 PowerPC64LE CPU 架构的 Linux,FreeBSD 或 Mac OS X 上运行。官方预构建的二进制文件通常针对 x86_64 进行编译,并利用 SSE 4.2 指令集,因此,除非另有说明,支持它的 CPU 使用将成为额外的系统需求。下面是检查当前 CPU 是否支持 SSE 4.2 的命令:
$ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported"
一、单节点设置
1. 下载
# 查看版本
https://github.com/ClickHouse/ClickHouse/releases
# 最新稳定版本安装包下载地址
https://github.com/ClickHouse/ClickHouse/releases/download/v24.1.8.22-stable/clickhouse-client-24.1.8.22.x86_64.rpm
https://github.com/ClickHouse/ClickHouse/releases/download/v24.1.8.22-stable/clickhouse-common-static-24.1.8.22.x86_64.rpm
https://github.com/ClickHouse/ClickHouse/releases/download/v24.1.8.22-stable/clickhouse-server-24.1.8.22.x86_64.rpm
2. 安装
# 首先安装通用依赖包
rpm -ivh clickhouse-common-static-24.1.8.22.x86_64.rpm
# 安装服务器,遇到 Enter password for default user 提示时输入密码
rpm -ivh clickhouse-server-24.1.8.22.x86_64.rpm
# 安装命令行客户端
rpm -ivh clickhouse-client-24.1.8.22.x86_64.rpm
3. 启动
sudo service clickhouse-server start
查看启动后进程:
[root@vvml-yz-hbase-test~]#ps -ef | grep clickhouse | grep -v grep
clickho+ 5322 1 0 08:49 ? 00:00:00 clickhouse-watchdog --config-file /etc/clickhouse-server/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid --daemon
clickho+ 5323 5322 6 08:49 ? 00:00:00 /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid --daemon
[root@vvml-yz-hbase-test~]#
查看监听端口
[root@vvml-yz-hbase-test~]#netstat -antpl | grep clickhouse
tcp 0 0 127.0.0.1:9004 0.0.0.0:* LISTEN 5323/clickhouse-ser
tcp 0 0 127.0.0.1:9005 0.0.0.0:* LISTEN 5323/clickhouse-ser
tcp 0 0 127.0.0.1:9009 0.0.0.0:* LISTEN 5323/clickhouse-ser
tcp 0 0 127.0.0.1:8123 0.0.0.0:* LISTEN 5323/clickhouse-ser
tcp 0 0 127.0.0.1:9000 0.0.0.0:* LISTEN 5323/clickhouse-ser
tcp6 0 0 ::1:9004 :::* LISTEN 5323/clickhouse-ser
tcp6 0 0 ::1:9005 :::* LISTEN 5323/clickhouse-ser
tcp6 0 0 ::1:9009 :::* LISTEN 5323/clickhouse-ser
tcp6 0 0 ::1:8123 :::* LISTEN 5323/clickhouse-ser
tcp6 0 0 ::1:9000 :::* LISTEN 5323/clickhouse-ser
[root@vvml-yz-hbase-test~]#
服务端日志的默认位置是 /var/log/clickhouse-server/。当服务端在日志中记录 Ready for connections 消息,即表示服务端已准备好处理客户端连接。一旦 clickhouse-server 启动并运行,可以利用 clickhouse-client 连接到服务端,并运行一些测试查询。
4. 验证
[root@vvml-yz-hbase-test~]#clickhouse-client --password="123456" --query "select version();"
24.1.8.22
[root@vvml-yz-hbase-test~]#clickhouse-client --password="123456" -n --query "select 1;select 2;"
1
2
[root@vvml-yz-hbase-test~]#
二、导入示例数据集
1. 下载并提取表数据
curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv
curl https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv
2. 创建库表
创建 sample.sql 文件,内容如下(注意嵌套表定义):
create database if not exists tutorial;
CREATE TABLE tutorial.hits_v1
(
`WatchID` UInt64,
`JavaEnable` UInt8,
`Title` String,
`GoodEvent` Int16,
`EventTime` DateTime,
`EventDate` Date,
`CounterID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RegionID` UInt32,
`UserID` UInt64,
`CounterClass` Int8,
`OS` UInt8,
`UserAgent` UInt8,
`URL` String,
`Referer` String,
`URLDomain` String,
`RefererDomain` String,
`Refresh` UInt8,
`IsRobot` UInt8,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`FlashMinor2` String,
`NetMajor` UInt8,
`NetMinor` UInt8,
`UserAgentMajor` UInt16,
`UserAgentMinor` FixedString(2),
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`MobilePhone` UInt8,
`MobilePhoneModel` String,
`Params` String,
`IPNetworkID` UInt32,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`IsArtifical` UInt8,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`ClientTimeZone` Int16,
`ClientEventTime` DateTime,
`SilverlightVersion1` UInt8,
`SilverlightVersion2` UInt8,
`SilverlightVersion3` UInt32,
`SilverlightVersion4` UInt16,
`PageCharset` String,
`CodeVersion` UInt32,
`IsLink` UInt8,
`IsDownload` UInt8,
`IsNotBounce` UInt8,
`FUniqID` UInt64,
`HID` UInt32,
`IsOldCounter` UInt8,
`IsEvent` UInt8,
`IsParameter` UInt8,
`DontCountHits` UInt8,
`WithHash` UInt8,
`HitColor` FixedString(1),
`UTCEventTime` DateTime,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`WindowName` Int32,
`OpenerName` Int32,
`HistoryLength` Int16,
`BrowserLanguage` FixedString(2),
`BrowserCountry` FixedString(2),
`SocialNetwork` String,
`SocialAction` String,
`HTTPError` UInt16,
`SendTiming` Int32,
`DNSTiming` Int32,
`ConnectTiming` Int32,
`ResponseStartTiming` Int32,
`ResponseEndTiming` Int32,
`FetchTiming` Int32,
`RedirectTiming` Int32,
`DOMInteractiveTiming` Int32,
`DOMContentLoadedTiming` Int32,
`DOMCompleteTiming` Int32,
`LoadEventStartTiming` Int32,
`LoadEventEndTiming` Int32,
`NSToDOMContentLoadedTiming` Int32,
`FirstPaintTiming` Int32,
`RedirectCount` Int8,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`ParamPrice` Int64,
`ParamOrderID` String,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`GoalsReached` Array(UInt32),
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`RefererHash` UInt64,
`URLHash` UInt64,
`CLID` UInt32,
`YCLID` UInt64,
`ShareService` String,
`ShareURL` String,
`ShareTitle` String,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`IslandID` FixedString(16),
`RequestNum` UInt32,
`RequestTry` UInt8
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID);
CREATE TABLE tutorial.visits_v1
(
`CounterID` UInt32,
`StartDate` Date,
`Sign` Int8,
`IsNew` UInt8,
`VisitID` UInt64,
`UserID` UInt64,
`StartTime` DateTime,
`Duration` UInt32,
`UTCStartTime` DateTime,
`PageViews` Int32,
`Hits` Int32,
`IsBounce` UInt8,
`Referer` String,
`StartURL` String,
`RefererDomain` String,
`StartURLDomain` String,
`EndURL` String,
`LinkURL` String,
`IsDownload` UInt8,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`PlaceID` Int32,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`IsYandex` UInt8,
`GoalReachesDepth` Int32,
`GoalReachesURL` Int32,
`GoalReachesAny` Int32,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`MobilePhoneModel` String,
`ClientEventTime` DateTime,
`RegionID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`IPNetworkID` UInt32,
`SilverlightVersion3` UInt32,
`CodeVersion` UInt32,
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`UserAgentMajor` UInt16,
`UserAgentMinor` UInt16,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`SilverlightVersion2` UInt8,
`SilverlightVersion4` UInt16,
`FlashVersion3` UInt16,
`FlashVersion4` UInt16,
`ClientTimeZone` Int16,
`OS` UInt8,
`UserAgent` UInt8,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`NetMajor` UInt8,
`NetMinor` UInt8,
`MobilePhone` UInt8,
`SilverlightVersion1` UInt8,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`JavaEnable` UInt8,
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`BrowserLanguage` UInt16,
`BrowserCountry` UInt16,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`Params` Array(String),
`Goals` Nested(
ID UInt32,
Serial UInt32,
EventTime DateTime,
Price Int64,
OrderID String,
CurrencyID UInt32),
`WatchIDs` Array(UInt64),
`ParamSumPrice` Int64,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`ClickLogID` UInt64,
`ClickEventID` Int32,
`ClickGoodEvent` Int32,
`ClickEventTime` DateTime,
`ClickPriorityID` Int32,
`ClickPhraseID` Int32,
`ClickPageID` Int32,
`ClickPlaceID` Int32,
`ClickTypeID` Int32,
`ClickResourceID` Int32,
`ClickCost` UInt32,
`ClickClientIP` UInt32,
`ClickDomainID` UInt32,
`ClickURL` String,
`ClickAttempt` UInt8,
`ClickOrderID` UInt32,
`ClickBannerID` UInt32,
`ClickMarketCategoryID` UInt32,
`ClickMarketPP` UInt32,
`ClickMarketCategoryName` String,
`ClickMarketPPName` String,
`ClickAWAPSCampaignName` String,
`ClickPageName` String,
`ClickTargetType` UInt16,
`ClickTargetPhraseID` UInt64,
`ClickContextType` UInt8,
`ClickSelectType` Int8,
`ClickOptions` String,
`ClickGroupBannerID` Int32,
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`FirstVisit` DateTime,
`PredLastVisit` Date,
`LastVisit` Date,
`TotalVisits` UInt32,
`TraficSource` Nested(
ID Int8,
SearchEngineID UInt16,
AdvEngineID UInt8,
PlaceID UInt16,
SocialSourceNetworkID UInt8,
Domain String,
SearchPhrase String,
SocialSourcePage String),
`Attendance` FixedString(16),
`CLID` UInt32,
`YCLID` UInt64,
`NormalizedRefererHash` UInt64,
`SearchPhraseHash` UInt64,
`RefererDomainHash` UInt64,
`NormalizedStartURLHash` UInt64,
`StartURLDomainHash` UInt64,
`NormalizedEndURLHash` UInt64,
`TopLevelDomain` UInt64,
`URLScheme` UInt64,
`OpenstatServiceNameHash` UInt64,
`OpenstatCampaignIDHash` UInt64,
`OpenstatAdIDHash` UInt64,
`OpenstatSourceIDHash` UInt64,
`UTMSourceHash` UInt64,
`UTMMediumHash` UInt64,
`UTMCampaignHash` UInt64,
`UTMContentHash` UInt64,
`UTMTermHash` UInt64,
`FromHash` UInt64,
`WebVisorEnabled` UInt8,
`WebVisorActivity` UInt32,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`Market` Nested(
Type UInt8,
GoalID UInt32,
OrderID String,
OrderPrice Int64,
PP UInt32,
DirectPlaceID UInt32,
DirectOrderID UInt32,
DirectBannerID UInt32,
GoodID String,
GoodName String,
GoodQuantity Int32,
GoodPrice Int64),
`IslandID` FixedString(16)
)
ENGINE = CollapsingMergeTree(Sign)
PARTITION BY toYYYYMM(StartDate)
ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)
SAMPLE BY intHash32(UserID);
执行 sample.sql 文件:
clickhouse-client --password="123456" --queries-file sample.sql
3. 导入数据
clickhouse-client --password="123456" --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv
clickhouse-client --password="123456" --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv
找出可用的设置、含义及其默认值的最简单方法是查询 system.settings 表:
vvml-yz-hbase-test.172.18.4.126 :) select name, value, changed, description
from system.settings
where name like '%max_insert_b%';
SELECT
name,
value,
changed,
description
FROM system.settings
WHERE name LIKE '%max_insert_b%'
Query id: 05bc6241-2d1f-432e-87b3-f35a3ad612c8
┌─name──────────────────┬─value───┬─changed─┬─description───────────────────────────────────────────────────────────────────────────────┐
│ max_insert_block_size │ 1048449 │ 0 │ The maximum block size for insertion, if we control the creation of blocks for insertion. │
└───────────────────────┴─────────┴─────────┴───────────────────────────────────────────────────────────────────────────────────────────┘
1 row in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.126 :)
4. 优化表
可以 OPTIMIZE 导入后的表。使用 MergeTree-family 引擎配置的表总是在后台合并数据部分以优化数据存储(或至少检查是否有意义)。这些查询强制表引擎立即进行存储优化(较慢,谨慎手工执行):
clickhouse-client --password="123456" --query "OPTIMIZE TABLE tutorial.hits_v1 FINAL"
clickhouse-client --password="123456" --query "OPTIMIZE TABLE tutorial.visits_v1 FINAL"
5. 查询示例
vvml-yz-hbase-test.172.18.4.126 :) SELECT
StartURL AS URL,
AVG(Duration) AS AvgDuration
FROM tutorial.visits_v1
WHERE StartDate BETWEEN '2014-03-23' AND '2014-03-30'
GROUP BY URL
ORDER BY AvgDuration DESC
LIMIT 10;
SELECT
StartURL AS URL,
AVG(Duration) AS AvgDuration
FROM tutorial.visits_v1
WHERE (StartDate >= '2014-03-23') AND (StartDate <= '2014-03-30')
GROUP BY URL
ORDER BY AvgDuration DESC
LIMIT 10
Query id: fc2e8638-5081-496a-964b-a679eab63af9
┌─URL─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─AvgDuration─┐
│ http://itpalanija-pri-patrivative=0&ads_app_user │ 60127 │
│ http://renaul-myd-ukraine │ 58938 │
│ http://karta/Futbol/dynamo.kiev.ua/kawaica.su/648 │ 56538 │
│ https://moda/vyikroforum1/top.ru/moscow/delo-product/trend_sms/multitryaset/news/2014/03/201000 │ 55218 │
│ http://e.mail=on&default?abid=2061&scd=yes&option?r=city_inter.com/menu&site-zaferio.ru/c/m.ensor.net/ru/login=false&orderStage.php?Brandidatamalystyle/20Mar2014%2F007%2F94dc8d2e06e56ed56bbdd │ 51378 │
│ http://karta/Futbol/dynas.com/haberler.ru/messages.yandsearchives/494503_lte_13800200319 │ 49078 │
│ http://xmusic/vstreatings of speeds │ 36925 │
│ http://news.ru/yandex.ru/api.php&api=http://toberria.ru/aphorizana │ 36902 │
│ http://bashmelnykh-metode.net/video/#!/video/emberkas.ru/detskij-yazi.com/iframe/default.aspx?id=760928&noreask=1&source │ 34323 │
│ http://censonhaber/547-popalientLog=0&strizhki-petro%3D&comeback=search?lr=213&text │ 31773 │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────┘
10 rows in set. Elapsed: 0.080 sec. Processed 1.43 million rows, 112.12 MB (17.91 million rows/s., 1.40 GB/s.)
Peak memory usage: 45.07 MiB.
vvml-yz-hbase-test.172.18.4.126 :) SELECT
sum(Sign) AS visits,
sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits,
(100. * goal_visits) / visits AS goal_percent
FROM tutorial.visits_v1
WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru');
SELECT
sum(Sign) AS visits,
sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits,
(100. * goal_visits) / visits AS goal_percent
FROM tutorial.visits_v1
WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru')
Query id: 02df86cc-ef97-4cec-9892-6ba92dc87d2c
┌─visits─┬─goal_visits─┬──────goal_percent─┐
│ 10543 │ 8553 │ 81.12491700654462 │
└────────┴─────────────┴───────────────────┘
1 row in set. Elapsed: 0.014 sec. Processed 19.72 thousand rows, 3.45 MB (1.46 million rows/s., 255.24 MB/s.)
Peak memory usage: 5.01 MiB.
vvml-yz-hbase-test.172.18.4.126 :)
三、集群部署
本次部署使用四台主机,构建两个分片,每个分片两个副本的 ClickHouse 集群,IP 和主机名如下:
172.18.4.126 node1
172.18.4.188 node2
172.18.4.71 node3
172.18.4.86 node4
注意,ClickHouse 要求每个分片的每个副本必须配置在单独的实例上,也就是说在整个集群范围内,一共有多少个副本,就需要创建多少个 ClickHouse 实例。最佳实践是3分片2副本6实例。
ClickHouse 推荐使用 ClickHouse Keeper 替代 Zookeeper(https://clickhouse.com/docs/knowledgebase/why_recommend_clickhouse_keeper_over_zookeeper)。
下面使用 ClickHouse Keeper 配置 ClickHouse 集群。具体操作步骤参考 ClickHouse Keeper 用户指南(https://clickhouse.com/docs/en/guides/sre/keeper/clickhouse-keeper#clickhouse-keeper-user-guide)。
ClickHouse 服务器中捆绑了 clickhouse-keeper。如果已经安装服务器,则无法单独安装 clickhouse-keeper,会收到冲突错误:
[root@vvml-yz-hbase-test~]#rpm -ivh clickhouse-keeper-24.1.8.22.x86_64.rpm
error: Failed dependencies:
clickhouse-server conflicts with clickhouse-keeper-0:24.1.8.22-1.x86_64
[root@vvml-yz-hbase-test~]#
但如果有仅用作 clickhouse-keeper 的服务器,则仅可以单独安装 clickhouse-keeper。
# 查看版本
https://github.com/ClickHouse/ClickHouse/releases
# 最新稳定版本安装包下载地址
https://github.com/ClickHouse/ClickHouse/releases/download/v24.1.8.22-stable/clickhouse-keeper-24.1.8.22.x86_64.rpm
本次部署在每个主机安装 clickhouse-server,而不单独安装 clickhouse-keeper。分片、副本、keeper规划如下:
node1 分片1副本1 keeper
node2 分片1副本2 keeper
node3 分片2副本1 keeper
node4 分片2副本2
四台主机都作为数据分片,每个数据分片有两个副本。keeper 部署到三个实例上,奇数实例用于实现 ClickHouse Keeper 中要求的票选数。
0. 安装前准备
在配置 ClickHouse 集群前,需要在全部四台机器上完成以下准备工作:
- 启动 NTP 时钟同步
- /etc/hosts 文件中添加构成集群的所有主机名
- 配置所有主机间 ssh 免密
- 修改用户可打开文件数与进程数
- 禁用防火墙
- 禁用 transparent hugepage
1. 安装配置 ClickHouse Keeper
在 node1、node2、node3 三台主机上执行下面的操作步骤。
(1)安装 ClickHouse Server 和 ClickHouse Client
rpm -ivh clickhouse-common-static-24.1.8.22.x86_64.rpm
rpm -ivh clickhouse-server-24.1.8.22.x86_64.rpm
rpm -ivh clickhouse-client-24.1.8.22.x86_64.rpm
(2)修改 ClickHouse 主配置文件
# 修改前先备份
cp /etc/clickhouse-server/config.xml /etc/clickhouse-server/config.xml.bak
修改 /etc/clickhouse-server/config.xml 主配置文件,在根节点 <clickhouse> 下添加以下内容:
<!-- 添加以下条目以允许通过网络接口进行外部通信。 -->
<listen_host>0.0.0.0</listen_host>
<!-- 指定实例启用 ClickHouse Keeper。更新每台服务器的<server_id>设置,node1为1、node2为2、node3为3。-->
<keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<snapshot_storage_path>/var/lib/clickhouse/coordination/snapshots</snapshot_storage_path>
<coordination_settings>
<operation_timeout_ms>10000</operation_timeout_ms>
<session_timeout_ms>30000</session_timeout_ms>
<raft_logs_level>warning</raft_logs_level>
</coordination_settings>
<raft_configuration>
<server>
<id>1</id>
<hostname>node1</hostname>
<port>9234</port>
</server>
<server>
<id>2</id>
<hostname>node2</hostname>
<port>9234</port>
</server>
<server>
<id>3</id>
<hostname>node3</hostname>
<port>9234</port>
</server>
</raft_configuration>
</keeper_server>
<!-- 指定实例所使用的 ClickHouse Keeper -->
<zookeeper>
<node>
<host>node1</host>
<port>9181</port>
</node>
<node>
<host>node2</host>
<port>9181</port>
</node>
<node>
<host>node3</host>
<port>9181</port>
</node>
</zookeeper>
(3)重启 ClickHouse
sudo service clickhouse-server restart
(4)验证 Keeper 实例是否正在运行
在 node1、node2、node3 上执行下面的命令,如果 Keeper 运行正常,ruok 命令将返回 imok:
[root@vvml-yz-hbase-test~]#echo ruok | nc localhost 9181; echo
imok
[root@vvml-yz-hbase-test~]#
(5)确认 zookeeper 系统表
系统数据库有一个名为 zookeeper 的表,其中包含 ClickHouse Keeper 实例的详细信息:
[root@vvml-yz-hbase-test~]#clickhouse-client --password="123456"
ClickHouse client version 24.1.8.22 (official build).
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 24.1.8.
vvml-yz-hbase-test.172.18.4.126 :) SELECT *
FROM system.zookeeper
WHERE path IN ('/', '/clickhouse');
SELECT *
FROM system.zookeeper
WHERE path IN ('/', '/clickhouse')
Query id: e713c446-26c5-4c3f-994c-db22eb68b9ad
┌─name───────┬─value─┬─path────────┐
│ keeper │ │ / │
│ clickhouse │ │ / │
│ task_queue │ │ /clickhouse │
│ sessions │ │ /clickhouse │
└────────────┴───────┴─────────────┘
4 rows in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.126 :)
2. 配置 ClickHouse 集群
(1)在新主机(node4)上安装 ClickHouse Server 和 ClickHouse Client
rpm -ivh clickhouse-common-static-24.1.8.22.x86_64.rpm
rpm -ivh clickhouse-server-24.1.8.22.x86_64.rpm
rpm -ivh clickhouse-client-24.1.8.22.x86_64.rpm
(2)修改新主机(node4)上的配置文件
# 修改前先备份
cp /etc/clickhouse-server/config.xml /etc/clickhouse-server/config.xml.bak
修改 /etc/clickhouse-server/config.xml 主配置文件,在根节点 <clickhouse> 下添加以下内容:
<!-- 添加以下条目以允许通过网络接口进行外部通信。 -->
<listen_host>0.0.0.0</listen_host>
<!-- 指定实例所使用的 ClickHouse Keeper -->
<zookeeper>
<node>
<host>node1</host>
<port>9181</port>
</node>
<node>
<host>node2</host>
<port>9181</port>
</node>
<node>
<host>node3</host>
<port>9181</port>
</node>
</zookeeper>
然后在全部四台主机上执行下面的操作步骤。
(3)更新配置
修改 /etc/clickhouse-server/config.xml 主配置文件,在根节点 <clickhouse> 下添加以下内容:
<remote_servers>
<cluster_2S_2R>
<shard>
<replica>
<host>node1</host>
<port>9000</port>
<user>default</user>
<password>123456</password>
</replica>
<replica>
<host>node2</host>
<port>9000</port>
<user>default</user>
<password>123456</password>
</replica>
</shard>
<shard>
<replica>
<host>node3</host>
<port>9000</port>
<user>default</user>
<password>123456</password>
</replica>
<replica>
<host>node4</host>
<port>9000</port>
<user>default</user>
<password>123456</password>
</replica>
</shard>
</cluster_2S_2R>
</remote_servers>
(4)重启 ClickHouse 并验证集群已创建
sudo service clickhouse-server restart
查看集群:
[root@vvml-yz-hbase-test~]#clickhouse-client --password="123456"
ClickHouse client version 24.1.8.22 (official build).
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 24.1.8.
vvml-yz-hbase-test.172.18.4.126 :) SHOW CLUSTERS;
SHOW CLUSTERS
Query id: 80e81978-d1f4-4721-85d8-7e7803230373
┌─cluster───────┐
│ cluster_2S_2R │
└───────────────┘
1 row in set. Elapsed: 0.001 sec.
vvml-yz-hbase-test.172.18.4.126 :) select cluster,shard_num,replica_num,host_name,is_local,user,database_shard_name,database_replica_name from system.clusters;
SELECT
cluster,
shard_num,
replica_num,
host_name,
is_local,
user,
database_shard_name,
database_replica_name
FROM system.clusters
Query id: 4b39d9ec-b4f7-4557-b76d-05f3893f4ef7
┌─cluster───────┬─shard_num─┬─replica_num─┬─host_name─┬─is_local─┬─user────┬─database_shard_name─┬─database_replica_name─┐
│ cluster_2S_2R │ 1 │ 1 │ node1 │ 1 │ default │ │ │
│ cluster_2S_2R │ 1 │ 2 │ node2 │ 0 │ default │ │ │
│ cluster_2S_2R │ 2 │ 1 │ node3 │ 0 │ default │ │ │
│ cluster_2S_2R │ 2 │ 2 │ node4 │ 0 │ default │ │ │
└───────────────┴───────────┴─────────────┴───────────┴──────────┴─────────┴─────────────────────┴───────────────────────┘
4 rows in set. Elapsed: 0.001 sec.
vvml-yz-hbase-test.172.18.4.126 :)
3. 创建分布式表
(1)创建数据库
使用 node1 上创建一个数据库。ON CLUSTER子句会自动在所有实例上创建数据库。
vvml-yz-hbase-test.172.18.4.126 :) CREATE DATABASE db1 ON CLUSTER 'cluster_2S_2R';
CREATE DATABASE db1 ON CLUSTER cluster_2S_2R
Query id: 7a8cd789-bcfb-4855-a131-ba7935cffcfb
┌─host──┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐
│ node1 │ 9000 │ 0 │ │ 3 │ 0 │
│ node3 │ 9000 │ 0 │ │ 2 │ 0 │
│ node4 │ 9000 │ 0 │ │ 1 │ 0 │
│ node2 │ 9000 │ 0 │ │ 0 │ 0 │
└───────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘
4 rows in set. Elapsed: 0.099 sec.
vvml-yz-hbase-test.172.18.4.126 :)
(2)创建本地表
在 db1 库中建表,同样,ON CLUSTER 子句会自动在所有实例上建表。
vvml-yz-hbase-test.172.18.4.126 :) CREATE TABLE db1.table1 on cluster 'cluster_2S_2R'
(
`id` UInt64,
`column1` String
)
ENGINE = MergeTree
ORDER BY column1;
CREATE TABLE db1.table1 ON CLUSTER cluster_2S_2R
(
`id` UInt64,
`column1` String
)
ENGINE = MergeTree
ORDER BY column1
Query id: abb936ad-3618-4821-92f6-cfaa83fb4d51
┌─host──┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐
│ node3 │ 9000 │ 0 │ │ 3 │ 2 │
│ node1 │ 9000 │ 0 │ │ 2 │ 2 │
└───────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘
┌─host──┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐
│ node4 │ 9000 │ 0 │ │ 1 │ 0 │
│ node2 │ 9000 │ 0 │ │ 0 │ 0 │
└───────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘
4 rows in set. Elapsed: 0.158 sec.
vvml-yz-hbase-test.172.18.4.126 :)
(3)在一个分片实例上(node1)新增两行
vvml-yz-hbase-test.172.18.4.126 :) INSERT INTO db1.table1
(id, column1)
VALUES
(1, 'abc'),
(2, 'def');
INSERT INTO db1.table1 (id, column1) FORMAT Values
Query id: 959da99c-c473-4c3a-9381-fa65b447161c
Ok.
2 rows in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.126 :)
(4)在另一个分片实例上(node3)新增两行
vvml-yz-hbase-test.172.18.4.71 :) INSERT INTO db1.table1
(id, column1)
VALUES
(3, 'ghi'),
(4, 'jkl');
INSERT INTO db1.table1 (id, column1) FORMAT Values
Query id: c8864197-ec82-4aba-8c2e-aaa2af468553
Ok.
2 rows in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.71 :)
(5)在所有实例上分别执行 SELECT 查询
node1、node3 上查询结果显示该本地实例上的两行数据:
# node1
vvml-yz-hbase-test.172.18.4.126 :) SELECT *
FROM db1.table1;
SELECT *
FROM db1.table1
Query id: 2f2fc679-9091-41ae-967b-4bd8e2ec7311
┌─id─┬─column1─┐
│ 1 │ abc │
│ 2 │ def │
└────┴─────────┘
2 rows in set. Elapsed: 0.001 sec.
vvml-yz-hbase-test.172.18.4.126 :)
# node3
vvml-yz-hbase-test.172.18.4.71 :) SELECT *
FROM db1.table1;
SELECT *
FROM db1.table1
Query id: 18843522-c678-45f0-901d-73e1bbfd4dbf
┌─id─┬─column1─┐
│ 3 │ ghi │
│ 4 │ jkl │
└────┴─────────┘
2 rows in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.71 :)
node2、node4 上查不到数据,说明写入本地表并没有复制到其副本。
(6)创建分布式表
可以创建一个分布式表来表示两个分片上的数据。具有分布式表引擎的表不存储自己的任何数据,而是允许在多个服务器上进行分布式查询处理。读取命中所有分片,写入可以分布在分片之间。在任一实例上创建分布式表:
vvml-yz-hbase-test.172.18.4.126 :) CREATE TABLE db1.dist_table ON CLUSTER 'cluster_2S_2R'
(
id UInt64,
column1 String
)
ENGINE = Distributed(cluster_2S_2R,db1,table1);
CREATE TABLE db1.dist_table ON CLUSTER cluster_2S_2R
(
`id` UInt64,
`column1` String
)
ENGINE = Distributed(cluster_2S_2R, db1, table1)
Query id: 7c08e756-90cf-4014-9368-dc41fe7d06f4
┌─host──┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐
│ node4 │ 9000 │ 0 │ │ 3 │ 0 │
│ node2 │ 9000 │ 0 │ │ 2 │ 0 │
│ node1 │ 9000 │ 0 │ │ 1 │ 0 │
│ node3 │ 9000 │ 0 │ │ 0 │ 0 │
└───────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘
4 rows in set. Elapsed: 0.095 sec.
vvml-yz-hbase-test.172.18.4.126 :)
(7)在所有实例上分别执行 SELECT 查询
所有实例上都创建了 db1.dist_table 表,但是 SELECT *
FROM db1.dist_table 查询结果却出乎意料。预想在所有实例上查询的结果相同,都返回两个分片中的所有四行数据,而实际四个实例返回的结果是不确定的:node1 有时返回全部4行,有时只返回前两行;node3 有时返回全部4行,有时只返回后两行;node2 有时返回后两行,有时不返回数据;node4 有时返回前两行,有时不返回数据。
说明:
- ClickHouse 集群是一种对等架构,在一个集群里每个 ClickHouse 实例都是独立的,即使是同一个分片内的不同副本实例间,也是没有主从概念。
- 在集群中任何一个实例上执行的 DDL 语句中使用 ON CLUSTER 子句,会自动在集群中的所有实例上执行。
- 分布式表实际上是一种视图,映射到 ClickHouse 集群实例上的本地表。从分布式表中执行 SELECT 查询会使用集群所有相关分片的资源。
- 底层本地表如果使用 MergeTree 表引擎,在分布式表上执行同一查询,返回结果有可能不确定。
4. 数据自动分片
首先想到的是如果在分布式表上能够执行 DML 语句,就应该可以达到数据自动分片的效果。
vvml-yz-hbase-test.172.18.4.126 :) insert into db1.dist_table
(id, column1)
values
(5, 'mno'),
(6, 'pqr');
INSERT INTO db1.dist_table (id, column1) FORMAT Values
Query id: 78b4cc88-9b13-4078-a3f0-d405f4338cfb
Elapsed: 0.002 sec.
Received exception from server (version 24.1.8):
Code: 55. DB::Exception: Received from localhost:9000. DB::Exception: Method write is not supported by storage Distributed with more than one shard and no sharding key provided. (STORAGE_REQUIRES_PARAMETER)
vvml-yz-hbase-test.172.18.4.126 :)
报错明确指出,当数据存储分布在多于一个的分片上,并且没有提供分片键时,分布式表不支持写入。为了进一步演示,下面使用和创建 hits_v1 表类似的 CREATE TABLE 语句创建一个新的本地表,有三点不同:
- 库名不同。
- 表名不同。
- 使用 ON CLUSTER 子句。
(1)创建带有分片键的本地表
在任一实例上执行下面的建表语句:
CREATE TABLE db1.hits_local ON CLUSTER 'cluster_2S_2R'
(
`WatchID` UInt64,
`JavaEnable` UInt8,
`Title` String,
`GoodEvent` Int16,
`EventTime` DateTime,
`EventDate` Date,
`CounterID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RegionID` UInt32,
`UserID` UInt64,
`CounterClass` Int8,
`OS` UInt8,
`UserAgent` UInt8,
`URL` String,
`Referer` String,
`URLDomain` String,
`RefererDomain` String,
`Refresh` UInt8,
`IsRobot` UInt8,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`FlashMinor2` String,
`NetMajor` UInt8,
`NetMinor` UInt8,
`UserAgentMajor` UInt16,
`UserAgentMinor` FixedString(2),
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`MobilePhone` UInt8,
`MobilePhoneModel` String,
`Params` String,
`IPNetworkID` UInt32,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`IsArtifical` UInt8,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`ClientTimeZone` Int16,
`ClientEventTime` DateTime,
`SilverlightVersion1` UInt8,
`SilverlightVersion2` UInt8,
`SilverlightVersion3` UInt32,
`SilverlightVersion4` UInt16,
`PageCharset` String,
`CodeVersion` UInt32,
`IsLink` UInt8,
`IsDownload` UInt8,
`IsNotBounce` UInt8,
`FUniqID` UInt64,
`HID` UInt32,
`IsOldCounter` UInt8,
`IsEvent` UInt8,
`IsParameter` UInt8,
`DontCountHits` UInt8,
`WithHash` UInt8,
`HitColor` FixedString(1),
`UTCEventTime` DateTime,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`WindowName` Int32,
`OpenerName` Int32,
`HistoryLength` Int16,
`BrowserLanguage` FixedString(2),
`BrowserCountry` FixedString(2),
`SocialNetwork` String,
`SocialAction` String,
`HTTPError` UInt16,
`SendTiming` Int32,
`DNSTiming` Int32,
`ConnectTiming` Int32,
`ResponseStartTiming` Int32,
`ResponseEndTiming` Int32,
`FetchTiming` Int32,
`RedirectTiming` Int32,
`DOMInteractiveTiming` Int32,
`DOMContentLoadedTiming` Int32,
`DOMCompleteTiming` Int32,
`LoadEventStartTiming` Int32,
`LoadEventEndTiming` Int32,
`NSToDOMContentLoadedTiming` Int32,
`FirstPaintTiming` Int32,
`RedirectCount` Int8,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`ParamPrice` Int64,
`ParamOrderID` String,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`GoalsReached` Array(UInt32),
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`RefererHash` UInt64,
`URLHash` UInt64,
`CLID` UInt32,
`YCLID` UInt64,
`ShareService` String,
`ShareURL` String,
`ShareTitle` String,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`IslandID` FixedString(16),
`RequestNum` UInt32,
`RequestTry` UInt8
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID);
(2)创建提供集群本地表视图的分布式表
在任一实例上执行下面的建表语句:
CREATE TABLE db1.hits_all ON CLUSTER 'cluster_2S_2R'
AS db1.hits_local
ENGINE = Distributed(cluster_2S_2R, db1, hits_local, rand());
(3)向分布式表中插入数据
在任一实例上执行下面的 insert 语句:
INSERT INTO db1.hits_all SELECT * FROM tutorial.hits_v1;
(4)在所有实例上分别执行 SELECT 查询
# node1
vvml-yz-hbase-test.172.18.4.126 :) select count(*) from db1.hits_local;
SELECT count(*)
FROM db1.hits_local
Query id: 10a10594-dad1-4693-96fa-a27e62256cc0
┌─count()─┐
│ 4437894 │
└─────────┘
1 row in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.126 :) select count(*) from db1.hits_all;
SELECT count(*)
FROM db1.hits_all
Query id: 4d825459-19fa-4f68-9326-cf6a9b7e113a
┌─count()─┐
│ 8873898 │
└─────────┘
1 row in set. Elapsed: 0.006 sec.
vvml-yz-hbase-test.172.18.4.126 :)
# node2
vvml-yz-hbase-test.172.18.4.188 :) select count(*) from db1.hits_local;
SELECT count(*)
FROM db1.hits_local
Query id: 7da679f4-4f0f-42e5-8c78-df6c9814c61f
┌─count()─┐
│ 4437894 │
└─────────┘
1 row in set. Elapsed: 0.003 sec.
vvml-yz-hbase-test.172.18.4.188 :) select count(*) from db1.hits_all;
SELECT count(*)
FROM db1.hits_all
Query id: 29594d92-e38e-473a-bc91-7a5fb37e0647
┌─count()─┐
│ 8873898 │
└─────────┘
1 row in set. Elapsed: 0.005 sec.
vvml-yz-hbase-test.172.18.4.188 :)
# node3
vvml-yz-hbase-test.172.18.4.71 :) select count(*) from db1.hits_local;
SELECT count(*)
FROM db1.hits_local
Query id: 2e5a0fca-9b13-4c4e-8cb9-f9ee60da6152
┌─count()─┐
│ 4436004 │
└─────────┘
1 row in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.71 :) select count(*) from db1.hits_all;
SELECT count(*)
FROM db1.hits_all
Query id: 85e6e7a4-eaab-4455-8c29-52ec0998a9b0
┌─count()─┐
│ 8873898 │
└─────────┘
1 row in set. Elapsed: 0.006 sec.
vvml-yz-hbase-test.172.18.4.71 :)
# node4
vvml-yz-hbase-test.172.18.4.86 :) select count(*) from db1.hits_local;
SELECT count(*)
FROM db1.hits_local
Query id: 9950210e-a4a4-4e1d-a449-4aefb2ea396f
┌─count()─┐
│ 4436004 │
└─────────┘
1 row in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.86 :) select count(*) from db1.hits_all;
SELECT count(*)
FROM db1.hits_all
Query id: 45777141-ff08-452f-93b8-1650265d4175
┌─count()─┐
│ 8873898 │
└─────────┘
1 row in set. Elapsed: 0.009 sec.
vvml-yz-hbase-test.172.18.4.86 :)
从查询结果可以看到,本地表 node1、node2 上的查询结果相同,node3、node4 上的查询结果相同,分布式表四个节点查询结果都相同。node1 和 node3 分属于集群中的不同分片,而在创建分布式表时指定的分布规则随机,所以这两个实例上的本地表数据量存在少许差异是符合预期的。结论是:
- 指定分片键的分布式表可以写数据。
- 数据按创建分布式表时指定的自定义的分片规则分布。
- select count(*) from tutorial.hits_v1 用了2毫秒,select count(*) from db1.hits_all 用了6毫秒,在本测试环境中,查询分布式表比查询本地表慢了两倍。
- 底层表如果使用 MergeTree 表引擎,其上正常定义了分布规则的分布式表,插入分布式表可以自动在同一分片上的不同副本之间做数据同步。
5. 使用 ReplicatedMergeTree 表引擎复制数据
按通常的理解,一个分布式数据库至少需要满足以下基本需求:
- 数据按指定规则自动分片
- 同一分片的多个副本数据保持一致
- 数据分片和多副本间的数据同步对应用透明。
ClickHouse 推荐的配置是使用 ReplicatedMergeTree 表引擎,自动完成副本间的数据复制。创建 ReplcatedMergeTree 表,通常需要设置宏来识别每个用于创建表的分片和副本。
(1)定义宏
在全部四台主机上执行下面的操作步骤。
修改 /etc/clickhouse-server/config.xml 主配置文件,在每个节点 <shard> 下添加 <internal_replication>true</internal_replication>:
<remote_servers>
<cluster_2S_2R>
<shard>
<internal_replication>true</internal_replication>
<replica>
<host>node1</host>
<port>9000</port>
<user>default</user>
<password>123456</password>
</replica>
<replica>
<host>node2</host>
<port>9000</port>
<user>default</user>
<password>123456</password>
</replica>
</shard>
<shard>
<internal_replication>true</internal_replication>
<replica>
<host>node3</host>
<port>9000</port>
<user>default</user>
<password>123456</password>
</replica>
<replica>
<host>node4</host>
<port>9000</port>
<user>default</user>
<password>123456</password>
</replica>
</shard>
</cluster_2S_2R>
</remote_servers>
在节点 </remote_servers> 后面添加以下内容:
<macros>
<shard>01</shard>
<replica>01</replica>
</macros>
node1 配置成 01、01;node2 配置成 01、02;node3 配置成 02、01;node4 配置成 02、02。
然后重启所有实例:
sudo service clickhouse-server restart
(2)创建本地表
在任一实例上执行下面的建表语句:
CREATE TABLE db1.hits_replica ON CLUSTER 'cluster_2S_2R'
(
`WatchID` UInt64,
`JavaEnable` UInt8,
`Title` String,
`GoodEvent` Int16,
`EventTime` DateTime,
`EventDate` Date,
`CounterID` UInt32,
`ClientIP` UInt32,
`ClientIP6` FixedString(16),
`RegionID` UInt32,
`UserID` UInt64,
`CounterClass` Int8,
`OS` UInt8,
`UserAgent` UInt8,
`URL` String,
`Referer` String,
`URLDomain` String,
`RefererDomain` String,
`Refresh` UInt8,
`IsRobot` UInt8,
`RefererCategories` Array(UInt16),
`URLCategories` Array(UInt16),
`URLRegions` Array(UInt32),
`RefererRegions` Array(UInt32),
`ResolutionWidth` UInt16,
`ResolutionHeight` UInt16,
`ResolutionDepth` UInt8,
`FlashMajor` UInt8,
`FlashMinor` UInt8,
`FlashMinor2` String,
`NetMajor` UInt8,
`NetMinor` UInt8,
`UserAgentMajor` UInt16,
`UserAgentMinor` FixedString(2),
`CookieEnable` UInt8,
`JavascriptEnable` UInt8,
`IsMobile` UInt8,
`MobilePhone` UInt8,
`MobilePhoneModel` String,
`Params` String,
`IPNetworkID` UInt32,
`TraficSourceID` Int8,
`SearchEngineID` UInt16,
`SearchPhrase` String,
`AdvEngineID` UInt8,
`IsArtifical` UInt8,
`WindowClientWidth` UInt16,
`WindowClientHeight` UInt16,
`ClientTimeZone` Int16,
`ClientEventTime` DateTime,
`SilverlightVersion1` UInt8,
`SilverlightVersion2` UInt8,
`SilverlightVersion3` UInt32,
`SilverlightVersion4` UInt16,
`PageCharset` String,
`CodeVersion` UInt32,
`IsLink` UInt8,
`IsDownload` UInt8,
`IsNotBounce` UInt8,
`FUniqID` UInt64,
`HID` UInt32,
`IsOldCounter` UInt8,
`IsEvent` UInt8,
`IsParameter` UInt8,
`DontCountHits` UInt8,
`WithHash` UInt8,
`HitColor` FixedString(1),
`UTCEventTime` DateTime,
`Age` UInt8,
`Sex` UInt8,
`Income` UInt8,
`Interests` UInt16,
`Robotness` UInt8,
`GeneralInterests` Array(UInt16),
`RemoteIP` UInt32,
`RemoteIP6` FixedString(16),
`WindowName` Int32,
`OpenerName` Int32,
`HistoryLength` Int16,
`BrowserLanguage` FixedString(2),
`BrowserCountry` FixedString(2),
`SocialNetwork` String,
`SocialAction` String,
`HTTPError` UInt16,
`SendTiming` Int32,
`DNSTiming` Int32,
`ConnectTiming` Int32,
`ResponseStartTiming` Int32,
`ResponseEndTiming` Int32,
`FetchTiming` Int32,
`RedirectTiming` Int32,
`DOMInteractiveTiming` Int32,
`DOMContentLoadedTiming` Int32,
`DOMCompleteTiming` Int32,
`LoadEventStartTiming` Int32,
`LoadEventEndTiming` Int32,
`NSToDOMContentLoadedTiming` Int32,
`FirstPaintTiming` Int32,
`RedirectCount` Int8,
`SocialSourceNetworkID` UInt8,
`SocialSourcePage` String,
`ParamPrice` Int64,
`ParamOrderID` String,
`ParamCurrency` FixedString(3),
`ParamCurrencyID` UInt16,
`GoalsReached` Array(UInt32),
`OpenstatServiceName` String,
`OpenstatCampaignID` String,
`OpenstatAdID` String,
`OpenstatSourceID` String,
`UTMSource` String,
`UTMMedium` String,
`UTMCampaign` String,
`UTMContent` String,
`UTMTerm` String,
`FromTag` String,
`HasGCLID` UInt8,
`RefererHash` UInt64,
`URLHash` UInt64,
`CLID` UInt32,
`YCLID` UInt64,
`ShareService` String,
`ShareURL` String,
`ShareTitle` String,
`ParsedParams` Nested(
Key1 String,
Key2 String,
Key3 String,
Key4 String,
Key5 String,
ValueDouble Float64),
`IslandID` FixedString(16),
`RequestNum` UInt32,
`RequestTry` UInt8
)
ENGINE = ReplicatedMergeTree(
'/clickhouse/tables/{shard}/hits',
'{replica}'
)
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID);
(3)创建分布式表
在任一实例上执行下面的建表语句:
CREATE TABLE db1.hits_replica_all ON CLUSTER 'cluster_2S_2R'
AS db1.hits_replica
ENGINE = Distributed(cluster_2S_2R, db1, hits_replica, rand());
(4)向分布式表中插入数据
在任一实例上执行下面的 insert 语句:
INSERT INTO db1.hits_replica_all SELECT * FROM tutorial.hits_v1;
(5)在所有实例上分别执行 SELECT 查询
# node1
vvml-yz-hbase-test.172.18.4.126 :) select count(*) from db1.hits_replica;
SELECT count(*)
FROM db1.hits_replica
Query id: e4f08a9c-39be-48cf-a8f9-6caaa98b9fed
┌─count()─┐
│ 4438089 │
└─────────┘
1 row in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.126 :) select count(*) from db1.hits_replica_all;
SELECT count(*)
FROM db1.hits_replica_all
Query id: 765f0ea5-a199-4bfb-85c4-6fd31e23f1af
┌─count()─┐
│ 8873898 │
└─────────┘
1 row in set. Elapsed: 0.007 sec.
vvml-yz-hbase-test.172.18.4.126 :)
# node2
vvml-yz-hbase-test.172.18.4.188 :) select count(*) from db1.hits_replica;
SELECT count(*)
FROM db1.hits_replica
Query id: b464df00-4c1a-4fea-8f2f-70fd0c32569f
┌─count()─┐
│ 4438089 │
└─────────┘
1 row in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.188 :) select count(*) from db1.hits_replica_all;
SELECT count(*)
FROM db1.hits_replica_all
Query id: 19a77125-7e7e-4654-bb90-8884c3b0ed26
┌─count()─┐
│ 8873898 │
└─────────┘
1 row in set. Elapsed: 0.007 sec.
vvml-yz-hbase-test.172.18.4.188 :)
# node3
vvml-yz-hbase-test.172.18.4.71 :) select count(*) from db1.hits_replica;
SELECT count(*)
FROM db1.hits_replica
Query id: e021f13c-b980-4297-9c26-b94465503101
┌─count()─┐
│ 4435809 │
└─────────┘
1 row in set. Elapsed: 0.003 sec.
vvml-yz-hbase-test.172.18.4.71 :) select count(*) from db1.hits_replica_all;
SELECT count(*)
FROM db1.hits_replica_all
Query id: 80ebbda2-be03-445c-8c16-f67428a45572
┌─count()─┐
│ 8873898 │
└─────────┘
1 row in set. Elapsed: 0.005 sec.
vvml-yz-hbase-test.172.18.4.71 :)
# node4
vvml-yz-hbase-test.172.18.4.86 :) select count(*) from db1.hits_replica;
SELECT count(*)
FROM db1.hits_replica
Query id: 204f483d-3fcb-4758-8df4-20e79f78e42e
┌─count()─┐
│ 4435809 │
└─────────┘
1 row in set. Elapsed: 0.002 sec.
vvml-yz-hbase-test.172.18.4.86 :) select count(*) from db1.hits_replica_all;
SELECT count(*)
FROM db1.hits_replica_all
Query id: e3d28b81-50b7-46e6-a855-1b40ef9fe42a
┌─count()─┐
│ 8873898 │
└─────────┘
1 row in set. Elapsed: 0.007 sec.
vvml-yz-hbase-test.172.18.4.86 :)
从查询结果可以看到,本地表 node1、node2 上的查询结果相同,node3、node4 上的查询结果相同,分布式表四个节点查询结果都相同。node1 和 node3 分属于集群中的不同分片,而在创建分布式表时指定的分布规则随机,所以这两个实例上的本地表数据量存在少许差异是符合预期的。ReplicatedMergeTree 表可以自动在同一分片的不同副本间同步数据。