How calling parameters and return data are encoded in Ceph Objecter RPC.
based on Ceph v16.2.5
Code Teardown Reminder
- Encoding procedure of invocation can be found at
src\osdc\Objecter.h\Objecter::prepare_<op>_op()
and src\osdc\Objecter.h\ObjectOperation
.
- Decoding procedure of returned data can be referenced at
src\osdc\Objecter.h\ObjectOperation::struct CB_ObjectOperation_<op>::operator()()
or src\osdc\Objecter.h\Objecter::C_<Op>::finish()
, which may very likely be in the same order as they were encoded in src\osd\PrimaryLogPG.cc\PrimaryLogPG::do_osd_ops()
.
Key Data Structures
Packaged OSD Operations
struct OSDOp;
using osdc_opvec = boost::container::small_vector<OSDOp, osdc_opvec_len>;
/* src/osdc/Objecter.h */
struct ObjectOperation {
osdc_opvec ops; // OSDOp[]
int flags = 0;
int priority = 0;
boost::container::small_vector<ceph::buffer::list*, osdc_opvec_len> out_bl;
boost::container::small_vector<
fu2::unique_function<void(boost::system::error_code, int,
const ceph::buffer::list& bl) &&>,
osdc_opvec_len> out_handler;
boost::container::small_vector<int*, osdc_opvec_len> out_rval;
boost::container::small_vector<boost::system::error_code*,
osdc_opvec_len> out_ec;
/* ... */
}; /* struct ObjectOperation */
namespace Objecter {
struct op_target_t {
/* ... */
/** the key/ID of object to operate on */
object_t base_oid; // { std::string name; }
/** provides the pool in which the object resides */
object_locator_t base_oloc; // { int64_t pool; ... }
/* to be calculated with Objecter::_calc_target() */
object_t target_oid;
object_locator_t target_oloc;
/* ... */
}; /* struct op_target_t */
struct Op {
/* ... */
op_target_t target;
/* ... */
/** in short OSDOp[], with optimized allocation code path */
osdc_opvec ops;
snapid_t snapid = CEPH_NOSNAP;
SnapContext snapc;
/** modify time */
ceph::real_time mtime;
/**
* legacy cumulative output
*
* Used only by certain read ops (getxattr, read, stat), and only when the
* overall `Op` has **only one** OSDOp. The content of the only effective
* `outdata` is then copied to the upper-layer-supplied bufferlist pointed
* to by this pointer (transferred over the network by the dedicated
* `Message::data` frame segment).
*/
ceph::buffer::list *outbl = nullptr;
/**
* individual output of each member op, which are just pointers to underlying
* fields of individual ops. zeroed by default, same size as `ops`
*/
boost::container::small_vector<ceph::buffer::list*, osdc_opvec_len> out_bl;
/**
* immediate callbacks, executed right after receiving reply
* @sa Objecter::handle_osd_op_reply(MOSDOpReply*)
*/
boost::container::small_vector<
fu2::unique_function<void(boost::system::error_code, int,
const ceph::buffer::list& bl) &&>,
osdc_opvec_len> out_handler;
boost::container::small_vector<int*, osdc_opvec_len> out_rval;
boost::container::small_vector<boost::system::error_code*,
osdc_opvec_len> out_ec;
/* ... */
}; /* struct Op */
} /* namespace Objecter */
Individual OSD Operation
struct ceph_osd_op;
struct sobject_t;
/* src/osd/osd_types.h */
struct OSDOp {
ceph_osd_op op; // opcode, flag and extra calling args
sobject_t soid; // stripped object name, not used by CephFS, for it has
// its own striping (or, as they call it, layout) logic
// (see src/osdc/Striper.cc/Striper::file_to_extents()).
// and a metadata object always has object size equal to
// stripe size, which is 4MiB
/* `outdata` (should be) pointed to by `out_bl`s, if need to claim return
data, so we may just stuff return data here, should be safe (?) */
ceph::buffer::list indata, outdata;
errorcode32_t rval = 0;
/* ... */
}; /* struct OSDOp */
/* src/include/rados.h */
struct ceph_osd_op {
__le16 op; /* CEPH_OSD_OP_* */
__le32 flags; /* CEPH_OSD_OP_FLAG_* */
union {
struct {
__le64 offset, length;
__le64 truncate_size;
__le32 truncate_seq;
} __attribute__ ((packed)) extent;
struct {
__le32 name_len;
__le32 value_len;
__u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
} __attribute__ ((packed)) xattr;
struct {
__u8 class_len;
__u8 method_len;
__u8 argc;
__le32 indata_len;
} __attribute__ ((packed)) cls;
struct {
__le64 count;
__le32 start_epoch; /* for the pgls sequence */
} __attribute__ ((packed)) pgls;
struct {
__le64 snapid;
} __attribute__ ((packed)) snap;
struct {
__le64 cookie;
__le64 ver; /* no longer used */
__u8 op; /* CEPH_OSD_WATCH_OP_* */
__u32 gen; /* registration generation */
__u32 timeout; /* connection timeout */
} __attribute__ ((packed)) watch;
struct {
__le64 cookie;
} __attribute__ ((packed)) notify;
struct {
__le64 unused;
__le64 ver;
} __attribute__ ((packed)) assert_ver;
struct {
__le64 offset, length;
__le64 src_offset;
} __attribute__ ((packed)) clonerange;
struct {
__le64 max; /* max data in reply */
} __attribute__ ((packed)) copy_get;
struct {
__le64 snapid;
__le64 src_version;
__u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
/*
* CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
* for src object, flags for dest object are in
* ceph_osd_op::flags.
*/
__le32 src_fadvise_flags;
} __attribute__ ((packed)) copy_from;
struct {
struct ceph_timespec stamp;
} __attribute__ ((packed)) hit_set_get;
struct {
__u8 flags;
} __attribute__ ((packed)) tmap2omap;
struct {
__le64 expected_object_size;
__le64 expected_write_size;
__le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
} __attribute__ ((packed)) alloc_hint;
struct {
__le64 offset;
__le64 length;
__le64 data_length;
} __attribute__ ((packed)) writesame;
struct {
__le64 offset;
__le64 length;
__le32 chunk_size;
__u8 type; /* CEPH_OSD_CHECKSUM_OP_TYPE_* */
} __attribute__ ((packed)) checksum;
} __attribute__ ((packed));
__le32 payload_len;
} __attribute__ ((packed));
/* src/include/object.h */
/** short for stripped object */
struct sobject_t {
object_t oid; // { std::string name; }
snapid_t snap; // { uint64_t val; }
/* ... */
}; /* struct sobject_t */
/* src/osdc/Objecter.cc */
void Objecter::_op_submit(Op *op, ceph::shunique_lock<ceph::shared_mutex>& lc,
ceph_tid_t *ptid);
/* src/osd/PrimaryLogPG.cc */
int PrimaryLogPG::do_osd_ops(OpContext *ctx, std::vector<OSDOp>& ops);
ObjectOperation
and Op
are virtually the same in terms of data fields.
ObjectOperation
only fills in the parameters of operations; it is generally a helper
class used to generate a transactional operation that contains multiple ops,
while Op
makes the final op_submit()
call. E.g.
/* src/mds/CDir.cc/CDir::_omap_fetch_more() */
ObjectOperation rd;
rd.omap_get_vals(...);
mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, NULL, 0,
new C_OnFinisher(fin, mdcache->mds->finisher));
/* results in omap_get_vals + read, atomic & consistent */
bufferlist-based RPC Format
Snapshot related info may be ambiguous or incorrect.
Variable | Type |
---|---|
o |
Objecter::Op* |
osd_op |
OSDOp& |
CEPH_OSD_OP_STAT
- context
o->snapid
(Self-managed snap) AFAIK the per-object self-managed snap is not used by CephFS.
CEPH_NOSNAP
is ((__u64)-2)
, i.e. 18446744073709551614.
o->target.base_oloc.pool
(omitted later)
o->target.base_oid
(omitted later)
- return data via
osd_op.outdata
(omitted later)uint64_t size
utime_t mtime
- return value
osd_op.rval
(omitted later)0
on success-ENOENT
on not exist
- context
CEPH_OSD_OP_CREATE
- context
o->mtime
o->snapc
- parameters
osd_op.flags
osd_op.soid
osd_op.indata
string category
(not implemented and not used)
- return data
- none
- return value
0
on success
-EEXIST
on object already present and CEPH_OSD_OP_FLAG_EXCL
exclusive create flag is set
-EINVAL
on decoding error, currently unreachable
- context
CEPH_OSD_OP_DELETE
(namelyObjecter::ObjectOperation::remove()
)- context
o->mtime
o->snapc
- parameters
osd_op.soid
- return data
- none
- return value
0
on success-ENOENT
on object does not exist
- context
CEPH_OSD_OP_READ
- context
o->snapid
- parameters
osd_op.extent
.offset
.length
if 0 read the whole object.truncate_size
.truncate_seq
- return data
- object content
- return value
0
on success
-ENOENT
on object not exist
-EIO
on CRC mismatch and not CEPH_OSD_OP_FLAG_FAILOK
- or whatever lower level throws,
-EAGAIN
and such
- return successful read length via
osd_op.extent.length
- context
CEPH_OSD_OP_WRITE
- context
o->snapc
o->mtime
- parameters
osd_op.extent
.offset
.length == osd_op.indata.length()
.truncate_size
.truncate_seq
osd_op.flags
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
osd_op.indata
- new content
- return data
- return value
0
on success
-EINVAL
invalid parameter, e.g. when osd_op.extent.length != osd_op.indata.length()
-EOPNOTSUPP
on unaligned append when pool.info.requires_aligned_append()
is true
-EFBIG
exceeding osd_max_object_size
Write may create object silently.
- context
CEPH_OSD_OP_WRITEFULL
- context
o->snapc
o->mtime
- parameters
osd_op.extent
.offset
.length == osd_op.indata.length()
osd_op.flags
osd_op.indata
- new content
- return value
0
on success-EINVAL
invalid parameter, e.g. whenosd_op.extent.length != osd_op.indata.length()
Truncate object to
osd_op.extent.length
if already exists.
- context
CEPH_OSD_OP_OMAPSETHEADER
- context (from
Objecter::mutate()
)o->snapc
o->mtime
- parameters
osd_op.extent
.offset == 0
.length == osd_op.indata.length()
osd_op.indata
- header content
- return value
0
on success-EOPNOTSUPP
on pool does not support omap
May create object silently.
- context (from
CEPH_OSD_OP_OMAPSETVALS
- context
o->snapc
o->mtime
- parameters
osd_op.extent
.offset == 0
.length == osd_op.indata.length()
osd_op.indata
- encoded
std::map<string, bufferlist>
or boost::container::flat_map
key-value pairs
Use
src/os/Transaction.cc/decode_str_str_map_to_bl()
to first extract the effective segment from the overall bl. Then decode the extracted portion entirely.
See
src/osd/PrimaryLogPG.cc/PrimaryLogPG::do_osd_ops()
case CEPH_OSD_OP_OMAPSETVALS
.
- encoded
- return value
0
on success-EINVAL
on decode error-EOPNOTSUPP
on pool does not support omap
May create object silently.
- context
CEPH_OSD_OP_OMAPRMKEYS
- context
o->snapc
o->mtime
- parameters
osd_op.extent
.offset == 0
.length == osd_op.indata.length()
osd_op.indata
- encoded
std::set<string>
orboost::container::flat_set
src/os/Transaction.cc/decode_str_set_to_bl()
- encoded
- return value
0
on success-ENOENT
on object not exist-EINVAL
on parameter decode error-EOPNOTSUPP
on pool does not support omap
- context
Below are operations not so common for MDS services.
CEPH_OSD_OP_OMAPGETHEADER
- context (from
Objecter::read()
)o->snapid
- parameters
- none
- return data
- header content
- return value
0
on success-ENOENT
on object not exist
- context (from
CEPH_OSD_OP_OMAPGETVALS
- context (from
Objecter::read()
)o->snapid
- parameters
osd_op.extent
.offset == 0
.length == osd_op.indata.length()
osd_op.indata
std::string start_after
uint64_t max_to_get
std::string filter_prefix
- return data
uint32_t num
total number of retrieved KV pairs
std::string key1
ceph::buffer::list value1
std::string key2
ceph::buffer::list value2
- … more keys
- … more values
bool truncated
if return data is truncated due to reaching max_to_get
or _conf->osd_max_omap_bytes_per_request
1 ~ 7 can be decoded with
ceph::decode()
overloaded for std::map<string, bufferlist>
See also
src/osdc/Objecter.h/ObjectOperation/CB_ObjectOperation_decodevals()
- return value
0
on success-EINVAL
on parameter decode error-ENOENT
on object does not exist
- context (from
CEPH_OSD_OP_OMAPGETVALSBYKEY
- context (from
Objecter::read()
)o->snapid
- parameters
osd_op.extent
.offset == 0
.length == osd_op.indata.length()
osd_op.indata
std::set<std::string>
orboost::container::flat_set<std::string>
of keys
- return data
std::map<std::string, bufferlist>
of kv pairs
- return value
0
on success-EINVAL
on parameter decode error-ENOENT
on object does not exist
- context (from
CEPH_OSD_OP_ZERO
- context
o->snapc
o->mtime
- parameters
osd_op.extent
.offset
.length
- return value
0
on success
-EOPNOTSUPP
if pool requires_aligned_append()
-EFBIG
exceeding osd_max_object_size
No-op if object does not exist.
- context
CEPH_OSD_OP_OMAPRMKEYS
- context (from
Objecter::mutate()
)o->snapc
o->mtime
- parameters
osd_op.indata
std::set<std::string>
orboost::container::flat_set<std::string>
of keys
- return value
0
on success
-EOPNOTSUPP
on pool does not support omap
-ENOENT
on object not exist
-EINVAL
on parameter decode error
- context (from
CEPH_OSD_OP_OMAPCLEAR
- context (from
Objecter::mutate()
)o->snapc
o->mtime
- return value
0
on success
-EOPNOTSUPP
if pool does not support omap
-ENOENT
if object does not exist
- context (from
CEPH_OSD_OP_SETXATTR
- context
o->snapc
o->mtime
- parameters
osd_op.xattr
.name_len == strlen(name) or 0 if not provided
.value_len == bl.length()
osd_op.indata
char *name
if provided (trailing'\0'
not included)ceph::buffer::list bl
value
- return value
0
on success-EFBIG
if value larger than_conf->osd_max_attr_size
(if set)-ENAMETOOLONG
if name longer than eitherosd->store->get_max_attr_name_length()
or_conf->osd_max_attr_name_len
May silently create object if not exist.
In original Ceph, the on-disk name for an xattr is
"_<name>"
therefore it is okay to not provide xattr name (the corresponding key in DB would be"_"
).- context
CEPH_OSD_OP_RMXATTR
(namelyObjecter::removexattr()
)- context
o->snapc
o->mtime
- parameters
osd_op.xattr
.name_len == strlen(name) or 0 if not provided
.value_len == bl.length()
osd_op.indata
char *name
if provided
- return value
0
on success-ENOENT
if object does not exist
- context
CEPH_OSD_OP_GETXATTR
- context
o->snapid
o->mtime
- parameters
osd_op.xattr
.name_len == strlen(name) or 0 if not provided
.value_len == 0
osd_op.indata
char *name
if provided
- return data
- xattr value
- return value
0
on success-ENOENT
on object not exist-ENODATA
on xattr not exist
- return value length via
osd_op.xattr.value_len
- context
CEPH_OSD_OP_TRUNCATE
(namelyObjectOperation::truncate()
)- context (from
Objecter::mutate()
)o->snapc
o->mtime
- parameters
osd_op.extent
.offset
.length == 0
osd_op.indata
- empty
ceph::buffer::list
- empty
- return value
0
on success
-EOPNOTSUPP
if pool requires_aligned_append()
-EFBIG
if exceeding osd_max_object_size
No-op if object does not exist.
- context (from
CEPH_OSD_OP_TRIMTRUNC
(viasrc/osdc/Filer.cc/Filer::truncate()
)- context (from
Objecter::_modify()
)
o->snapc
o->mtime
- parameters
osd_op.extent
.truncate_seq
Truncate sequence seems to be just a sequence number of asynchronous truncation jobs.
See also
src/mds/mdstypes.h/struct inode_t
. For every call to inode_t::truncate()
, the sequence number increases. PrimaryLogPG
treats an incoming truncation whose seq is not larger than the current one as a no-op.
.truncate_size
same as .offset
with CEPH_OSD_OP_TRUNCATE
- context (from
About Transaction
After the Op
is constructed on osdc
(OSD Client) side, the Op
is sent to
the remote OSD server for the OSDOp
s in Op
to be actually executed.
For read ops, they are eagerly executed in PrimaryLogPG::do_osd_ops()
(this
function sits in PrimaryLogPG::prepare_transaction()
procedure, which makes its
naming really really confusing, for it only does / performs part of the
received ops, not all of them). For writes and modifications, on object’s blob
or xattrs or OMAP, they are pushed into PGTransaction
to be later submitted to
PGBackend
.
OSDOp
s of aborted transactions are returned as is, that is, OSD may return
vector of OSDOp
s that has the first few ops done, the one somewhere in the
middle failed and the trailing untouched (default-initialized). The successful
ops will have rval
of 0, the failed one will have negative rval
, and their
handlers are expected to skip decoding for these ops. The untouched ones will have
untouched rval
s, which will probably (behavior defined by handler implementation)
be zero, their decoding handler will get executed, but will no doubt throw
ceph::buffer::error
and then modify its rval = -EIO
.
The supplied callback (i.e. onfinish
in Objecter::handle_osd_op_reply()
),
however, takes the overall return code (i.e. rc
in Objecter::handle_osd_op_reply()
,
which is the result
return of PrimaryLogPG::do_osd_ops()
, the return code of
the failed and not ignored sub-op in the transactional Op
). Typically (behavior
defined by implementation) it either does nothing or performs specific error handling if the return code is
negative (i.e. neither zero nor positive).