Ceph CRUSH Pipeline

based on version nautilus (b0c68711039276c1e8d5bfa838207468a36a165c)

Intuition

Image above explained pipeline for default RBD files, no pool related details are shown.

Algorithm CRUSH placement for object $x$

\[\begin{aligned} 1:\ &\hspace{0em} \textbf{procedure} \ \text{TAKE}(\alpha) &\text{//将项}\ \alpha\ \text{放入工作列表}\ \vec{i}\ \text{中} \\ 2:\ &\hspace{2em} \vec{i} \leftarrow \alpha \\ 3:\ &\hspace{0em} \textbf{end procedure} \\ \\ 4:\ &\hspace{0em} \textbf{procedure} \ \text{SELECT}(n,t) &\text{//选择}\ n\ \text{个类型为}\ t\ \text{的项目} \\ 5:\ &\hspace{2em} \vec{o} \leftarrow \emptyset &\text{//初始化输出变量为空} \\ 6:\ &\hspace{2em} \textbf{for}\ i \in \vec{i}\ \textbf{do} &\text{//遍历输入变量}\ \vec{i} \\ 7:\ &\hspace{4em} f \leftarrow 0 &\text{//目前没有操作失败} \\ 8:\ &\hspace{4em} \textbf{for}\ r \leftarrow 1, n\ \textbf{do} &\text{//遍历}\ n\ \text{个副本} \\ 9:\ &\hspace{6em} f_r \leftarrow 0 &\text{//该副本上操作还未遇到失败} \\ 10:\ &\hspace{6em} retry\_descent \leftarrow false \\ 11:\ &\hspace{6em} \textbf{repeat} \\ 12:\ &\hspace{8em} b \leftarrow bucket(i) &\text{//从桶}\ i\ \text{开始递归} \\ 13:\ &\hspace{8em} retry\_bucket \leftarrow false \\ 14:\ &\hspace{8em} \textbf{repeat} \\ 15:\ &\hspace{10em} \textbf{if}\ \text{``first n"}\ \textbf{then} &\text{//见 3.2.2 节} \\ 16:\ &\hspace{12em} r' \leftarrow r + f \\ 17:\ &\hspace{10em} \textbf{else} \\ 18:\ &\hspace{12em} r' \leftarrow r + f_r n \\ 19:\ &\hspace{10em} \textbf{end if} \\ 20:\ &\hspace{10em} o \leftarrow b.c(r',x) &\text{//见 3.4 节} \\ 21:\ &\hspace{10em} \textbf{if}\ type(o) \neq t\ \textbf{then} \\ 22:\ &\hspace{12em} b \leftarrow bucket(o) &\text{//继续递归} \\ 23:\ &\hspace{12em} retry\_bucket \leftarrow true \\ 24:\ &\hspace{10em} \textbf{else if}\ o \in \vec{o}\ \text{or}\ failed(o)\ \text{or}\ overload(o,x)\ \textbf{then} \\ 25:\ &\hspace{12em} f_r \leftarrow f_r + 1, f \leftarrow f + 1 \\ 26:\ &\hspace{12em} \textbf{if}\ o \in \vec{o}\ \text{and}\ f_r \lt 3\ \textbf{then} \\ &\hspace{14em} \text{//在当前桶内重试以解决冲突（见 3.2.1 节）} \\ 27:\ &\hspace{14em} retry\_bucket \leftarrow true \\ 28:\ &\hspace{12em} \textbf{else} \\ 29:\ &\hspace{14em} retry\_descent \leftarrow true &\text{//否则从}\ i\ \text{开始重新递归} \\ 30:\ &\hspace{12em} \textbf{end if} \\ 31:\ &\hspace{10em} \textbf{end if} \\ 32:\ &\hspace{8em} \textbf{until}\ \neg retry\_bucket \\ 33:\ &\hspace{6em} \textbf{until}\ \neg retry\_descent \\ 34:\ &\hspace{6em} \vec{o} \leftarrow [\vec{o},o] &\text{//将}\ o\ \text{添加到输出}\ \vec{o}\ \text{中} \\ 35:\ &\hspace{4em} \textbf{end for} \\ 36:\ &\hspace{2em} \textbf{end for} \\ 37:\ &\hspace{0em} \vec{i} \leftarrow \vec{o} &\text{//将输出拷贝回}\ \vec{i}\ \text{中} \\ 38:\ &\hspace{0em} \textbf{end procedure} \\ \\ 39:\ &\hspace{0em} \textbf{procedure}\ \text{EMIT} &\text{//将工作列表}\ \vec{i}\ \text{追加到结果中} \\ 40:\ &\hspace{2em} \vec{R} \leftarrow [\vec{R},\vec{i}] \\ 41:\ &\hspace{0em} \textbf{end procedure} \\ \end{aligned}\] \[\begin{aligned} E(d_\text{cached}) =\ &p_\text{hit} \big( E(d_\text{ssd}) + E(d_\text{network}) \big) \\ &+ (1 - p_\text{hit}) \underbrace{\big( E(d_\text{hdd}) + E_{c,\text{ssd}}(d_\text{dist}) \big)}_\text{promote lat.} \\ &+ (1 - p_\text{hit}) p_\text{write} \underbrace{\big( E(d_\text{ssd}) + E_{b,\text{hdd}}(d_\text{dist}) \big)}_\text{flush lat.} \\ &+ d_\text{software} \\ \text{s.t.} \hspace{1em} E_{r,\text{type}} (d_\text{dist}) \approx\ &E(\max_r d_\text{network}) + E(\max_r d_\text{type}), \text{type} \in \{\text{ssd}, \text{hdd}\} \\ \\ E(t_\text{cached}|_r) =\ &r (1 - p_\text{hit}) (1 + p_\text{write}) E(t_\text{IO}) \end{aligned}\]

For a primary-aligned modification,

\[\begin{aligned} E(d_\text{cached}) =\ &p_\text{hit} \big( E(d_\text{ssd}) + E(d_\text{network}) \big) \\ &+ (1 - p_\text{hit}) \underbrace{\big( E(d_\text{hdd}) + E_{c,\text{ssd}}(d_\text{dist}) \big)}_\text{promote lat.} \\ &+ (1 - p_\text{hit}) p_\text{write} \underbrace{\big( E(d_\text{ssd}) + E_{b,\text{hdd}}(d_\text{dist}) \big)}_\text{flush lat.} \\ &+ d_\text{software} \\ \text{s.t.} \hspace{1em} E_{r,\text{type}} (d_\text{dist}) \approx\ &E(\max_{r-1} d_\text{network}) + E(\max_r d_\text{type}), \text{type} \in \{\text{ssd}, \text{hdd}\} \\ \\ E(t_\text{cached}|_r) =\ &(r - 1) (1 - p_\text{hit}) (1 + p_\text{write}) E(t_\text{IO}) \end{aligned}\]

Dive-In

Logic

src/osd/OSDMap.h/OSDMap::map_to_pg()

  int map_to_pg(
    int64_t pool,
    const string& name,
    const string& key,
    const string& nspace,
    pg_t *pg) const;

src/osd/OSDMap.h/OSDMap::object_locator_to_pg()

Helper function of OSDMap::map_to_pg(), used more often.

  int object_locator_to_pg(const object_t& oid, const object_locator_t& loc,
                           pg_t &pg) const;
  pg_t object_locator_to_pg(const object_t& oid,
                            const object_locator_t& loc) const {
    pg_t pg;
    int ret = object_locator_to_pg(oid, loc, pg);
    ceph_assert(ret == 0);
    return pg;
  }

src/osd/osd_types.h/pg_pool_t::hash_key()

  /// hash a object name+namespace key to a hash position
  uint32_t hash_key(const string& key, const string& ns) const;

src/osd/OSDMap.h/OSDMap::raw_pg_to_pg()

   pg_t raw_pg_to_pg(pg_t pg) const {
     auto p = pools.find(pg.pool());
     ceph_assert(p != pools.end());
     return p->second.raw_pg_to_pg(pg);
   }

src/osd/osd_types.h/pg_pool_t::raw_pg_to_pg()

   /*
    * map a raw pg (with full precision ps) into an actual pg, for storage
    */
   pg_t raw_pg_to_pg(pg_t pg) const;

src/osd/OSDMap.h/OSDMap::pg_to_up_acting_osds()

   /**
    * map a pg to its acting set as well as its up set. You must use
    * the acting set for data mapping purposes, but some users will
    * also find the up set useful for things like deciding what to
    * set as pg_temp.
    * Each of these pointers must be non-NULL.
    */
   void pg_to_up_acting_osds(pg_t pg, vector<int> *up, int *up_primary,
                             vector<int> *acting, int *acting_primary) const {
     _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);
   }

src/osd/OSDMap.h/OSDMap::_pg_to_up_acting_osds()

   /**
    *  map to up and acting. Fills in whatever fields are non-NULL.
    */
   void _pg_to_up_acting_osds(const pg_t& pg, vector<int> *up, int *up_primary,
                              vector<int> *acting, int *acting_primary,
                              bool raw_pg_to_pg = true) const;

src/osd/OSDMap.h/OSDMap::_get_temp_osds()

Spits out empty active set when OSD mapping is stable (for temp_* members will not be specified then), and OSDMap:_pg_to_raw_osds() gets executed.

temp_osd stands for OSDs that are temporarily responsible and currently available for incoming I/O.

See here for why the temp mechanism is needed.

   /**
    * Get the pg and primary temp, if they are specified.
    * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
    * @param temp_primary [out] Will be the value in primary_temp, or a value derived
    * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
    */
   void _get_temp_osds(const pg_pool_t& pool, pg_t pg,
                       vector<int> *temp_pg, int *temp_primary) const;

src/osd/OSDMap.h/OSDMap::_pg_to_raw_osds()

Invokes CrushWrapper::do_rule().

To mess with the default PG-to-OSD placement policy, we may make our changes in this function.

   /// pg -> (raw osd list)
   void _pg_to_raw_osds(
     const pg_pool_t& pool, pg_t pg,
     vector<int> *osds,
     ps_t *ppps) const;

Command-Line

src/mon/OSDMonitor.h/OSDMonitor::preprocess_command()

    bool preprocess_command(MonOpRequestRef op);

    } else if (prefix == "osd map") {
      string poolstr, objstr, namespacestr;
      cmd_getval(cct, cmdmap, "pool", poolstr);
      cmd_getval(cct, cmdmap, "object", objstr);
      cmd_getval(cct, cmdmap, "nspace", namespacestr);

PREVIOUSComputer System Analysis and Performance Evaluation

NEXTCeph and Paxos