Ceph 版本:Kraken。ObjectStore 获取文件系统的 fsid:OSD 在用户态又构造了一层自己的文件系统来管理数据,并为其分配了唯一标识 UUID(即 fsid)。该 UUID 是其文件系统元信息的一部分;底层使用的存储驱动不同,其保存的位置也不同:BlueStore 将其保存在块设备的第一个块中,FileStore 将其保存在日志设备的第一个块中。
获取fsid方法:
/**
 * Probe a device for a known object-store backend and report its fsid.
 *
 * BlueStore is tried first (only when built with HAVE_LIBAIO), then
 * FileStore. The first backend whose get_block_device_fsid() returns 0
 * wins; its result is logged at level 0.
 *
 * @param cct   context used for logging and forwarded to the backends
 * @param path  path of the device to probe
 * @param fsid  output; written by the backend that succeeds
 * @return 0 on success, -EINVAL if neither backend succeeded (the
 *         backend-specific error codes are not propagated)
 */
int ObjectStore::probe_block_device_fsid(
  CephContext *cct,
  const string& path,
  uuid_d *fsid)
{
#if defined(HAVE_LIBAIO)
  if (BlueStore::get_block_device_fsid(cct, path, fsid) == 0) {
    lgeneric_dout(cct, 0) << __func__ << " " << path << " is bluestore, "
                          << *fsid << dendl;
    return 0;
  }
#endif

  if (FileStore::get_block_device_fsid(cct, path, fsid) == 0) {
    lgeneric_dout(cct, 0) << __func__ << " " << path << " is filestore, "
                          << *fsid << dendl;
    return 0;
  }

  return -EINVAL;
}
BlueStore 获取 OSD 文件系统的 uuid:该 uuid 保存在结构体 bluestore_bdev_label_t 的 osd_uuid 字段中,该结构体序列化后保存在块设备的第一个块中。
int BlueStore::get_block_device_fsid(CephContext* cct,
const string& path,
¦ ¦ ¦uuid_d *fsid)
{
bluestore_bdev_label_t label;
int r = _read_bdev_label(cct, path, &label);
if (r <
0)
¦
return r;
*fsid = label.osd_uuid;
return 0;
}
读取第一个block,反序列化得到label
/**
 * Read and validate the BlueStore device label stored at the start of `path`.
 *
 * Reads BDEV_LABEL_BLOCK_SIZE bytes from the device, decodes a
 * bluestore_bdev_label_t, and checks the crc32c of the encoded label bytes
 * against the 32-bit checksum stored immediately after them.
 *
 * @param cct    context (not referenced directly in this body)
 * @param path   path of the device to open read-only
 * @param label  output; receives the decoded label
 * @return 0 on success; -errno if the device cannot be opened; the negative
 *         result of bufferlist::read_fd() if the read fails; -EINVAL if the
 *         label cannot be decoded; -EIO on checksum mismatch
 */
int BlueStore::_read_bdev_label(CephContext* cct, string path,
                                bluestore_bdev_label_t *label)
{
  dout(10) << __func__ << dendl;

  const int fd = ::open(path.c_str(), O_RDONLY);
  if (fd < 0) {
    const int open_err = -errno;
    derr << __func__ << " failed to open " << path << ": "
         << cpp_strerror(open_err) << dendl;
    return open_err;
  }

  // Pull the label block into memory; the descriptor is not needed afterwards.
  bufferlist bl;
  const int nread = bl.read_fd(fd, BDEV_LABEL_BLOCK_SIZE);
  VOID_TEMP_FAILURE_RETRY(::close(fd));
  if (nread < 0) {
    derr << __func__ << " failed to read from " << path << ": "
         << cpp_strerror(nread) << dendl;
    return nread;
  }

  uint32_t actual_crc, stored_crc;
  // Declared outside the try block so the handler can report the offset.
  bufferlist::iterator cursor = bl.begin();
  try {
    ::decode(*label, cursor);

    // The checksum covers exactly the bytes consumed while decoding the label.
    bufferlist encoded_label;
    encoded_label.substr_of(bl, 0, cursor.get_off());
    actual_crc = encoded_label.crc32c(-1);

    ::decode(stored_crc, cursor);
  }
  catch (buffer::error& e) {
    derr << __func__ << " unable to decode label at offset "
         << cursor.get_off() << ": " << e.what() << dendl;
    return -EINVAL;
  }

  if (actual_crc != stored_crc) {
    derr << __func__ << " bad crc on label, expected " << stored_crc
         << " != actual " << actual_crc << dendl;
    return -EIO;
  }

  dout(10) << __func__ << " got " << *label << dendl;
  return 0;
}
FileStore 获取osd文件系统的OSD uuid
/**
 * Read the FileStore fsid from the journal located at `path`.
 *
 * Builds a temporary FileJournal on `path` and asks it to peek at the fsid
 * recorded in the journal header.
 *
 * @param cct   context handed to the temporary FileJournal
 * @param path  path of the journal device or file
 * @param fsid  in/out; its current value is passed to the FileJournal
 *              constructor, and peek_fsid() overwrites it on success
 * @return whatever FileJournal::peek_fsid() returns
 */
int FileStore::get_block_device_fsid(CephContext* cct,
                                     const string& path,
                                     uuid_d *fsid)
{
  // NOTE(review): the 0, 0, false, false arguments presumably mean
  // "no finisher, no sync cond, no directio, no aio" -- confirm against the
  // FileJournal constructor, which is not visible here.
  FileJournal journal(cct, *fsid, 0, 0, path.c_str(), false, false);
  const int rc = journal.peek_fsid(*fsid);
  return rc;
}
/**
 * Open the journal read-only, read its header and report the fsid in it.
 *
 * The journal must not already be open (fd == -1 is asserted). Once _open()
 * has succeeded the journal is closed again before returning, whether or
 * not the header could be read.
 *
 * @param fsid  output; set to header.fsid when read_header() succeeds
 * @return 0 on success; the non-zero result of _open() if opening fails;
 *         the negative result of read_header() if the header read fails
 */
int FileJournal::peek_fsid(uuid_d& fsid)
{
  assert(fd == -1);

  const int open_rc = _open(false, false);
  if (open_rc != 0)
    return open_rc;

  const int read_rc = read_header(&header);
  if (read_rc >= 0)
    fsid = header.fsid;

  close();
  return read_rc;
}
int FileJournal::_open(
bool forwrite,
bool create)
{
int flags, ret;
if (forwrite) {
¦ flags = O_RDWR;
¦
if (directio)
¦ ¦ flags |= O_DIRECT | O_DSYNC;
}
else {
¦ flags = O_RDONLY;
}
if (create)
¦ flags |= O_CREAT;
if (fd >=
0) {
¦
if (TEMP_FAILURE_RETRY(::close(fd))) {
¦ ¦
int err = errno;
¦ ¦ derr <<
"FileJournal::_open: error closing old fd: "
¦ ¦<< cpp_strerror(err) << dendl;
¦ }
}
fd = TEMP_FAILURE_RETRY(::open(fn.c_str(), flags,
0644));
if (fd <
0) {
¦
int err = errno;
¦ dout(
2) <<
"FileJournal::_open unable to open journal "
¦ ¦ << fn <<
": " << cpp_strerror(err) << dendl;
¦
return -err;
}
struct stat st;
ret = ::fstat(fd, &st);
if (ret) {
¦ ret = errno;
¦ derr <<
"FileJournal::_open: unable to fstat journal: " << cpp_strerror(ret) << dendl;
¦ ret = -ret;
¦
goto out_fd;
}
if (S_ISBLK(st.st_mode)) {
¦ ret = _open_block_device();
}
else if (S_ISREG(st.st_mode)) {
¦
if (aio && !force_aio) {
¦ ¦ derr <<
"FileJournal::_open: disabling aio for non-block journal. Use "
¦ ¦<<
"journal_force_aio to force use of aio anyway" << dendl;
¦ ¦ aio =
false;
¦ }
¦ ret = _open_file(st.st_size, st.st_blksize, create);
}
else {
¦ derr <<
"FileJournal::_open: wrong journal file type: " << st.st_mode
¦<< dendl;
¦ ret = -EINVAL;
}
if (ret)
¦
goto out_fd;
#ifdef HAVE_LIBAIO
if (aio) {
¦ aio_ctx =
0;
¦ ret = io_setup(
128, &aio_ctx);
¦
if (ret <
0) {
¦ ¦
switch (ret) {
case -EAGAIN:
¦ derr <<
"FileJournal::_open: user's limit of aio events exceeded. "
¦ ¦ ¦ ¦<<
"Try increasing /proc/sys/fs/aio-max-nr" << dendl;
¦
break;
default:
¦ derr <<
"FileJournal::_open: unable to setup io_context " << cpp_strerror(-ret) << dendl;
¦
break;
¦ ¦ }
¦ ¦
goto out_fd;
¦ }
}
#endif
max_size -= max_size % block_size;
dout(
1) <<
"_open " << fn <<
" fd " << fd
¦ <<
": " << max_size
¦ <<
" bytes, block size " << block_size
¦ <<
" bytes, directio = " << directio
¦ <<
", aio = " << aio
¦ << dendl;
return 0;
out_fd:
VOID_TEMP_FAILURE_RETRY(::close(fd));
fd = -
1;
return ret;
}
获取块设备的大小
获取块设备大小,并检查其是否不小于日志所需的最小大小(ONE_MEG);若小于该值则返回 -EINVAL。
int FileJournal::_open_block_device()
{
int64_t bdev_sz =
0;
int ret = get_block_device_size(fd, &bdev_sz);
if (ret) {
¦ dout(
0) << __func__ <<
": failed to read block device size." << dendl;
¦
return -EIO;
}
if (bdev_sz < ONE_MEG) {
¦ dout(
0) << __func__ <<
": your block device must be at least "
¦ ¦ << ONE_MEG <<
" bytes to be used for a Ceph journal." << dendl;
¦
return -EINVAL;
}
dout(
10) << __func__ <<
": ignoring osd journal size. "
¦ ¦<<
"We'll use the entire block device (size: " << bdev_sz <<
")"
¦ ¦<< dendl;
max_size = bdev_sz;
block_size = cct->_conf->journal_block_size;
if (cct->_conf->journal_discard) {
¦ discard = block_device_support_discard(fn.c_str());
¦ dout(
10) << fn <<
" support discard: " << (
int)discard << dendl;
}
return 0;
}
/**
 * Query the size in bytes of the block device behind `fd`.
 *
 * Uses the BLKGETSIZE64 ioctl when the platform headers define it;
 * otherwise falls back to BLKGETSIZE, which reports a count of 512-byte
 * sectors that is converted to bytes here.
 *
 * @param fd     open descriptor for the device
 * @param psize  output; receives the device size in bytes
 * @return the non-negative ioctl result on success (0 in practice),
 *         or -errno if the ioctl fails
 */
int get_block_device_size(int fd, int64_t *psize)
{
#ifdef BLKGETSIZE64
  int ret = ::ioctl(fd, BLKGETSIZE64, psize);
#elif defined(BLKGETSIZE)
  unsigned long sectors = 0;
  // The ioctl writes the sector count through this pointer.
  int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
  *psize = sectors * 512ULL;
#else
# error "Linux configuration error (get_block_device_size)"
#endif
  if (ret < 0)
    ret = -errno;
  return ret;
}
当 OSD 日志是一个普通文件(而不是块设备)时,使用该方法来打开该日志文件。
int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
bool create)
{
int ret;
int64_t conf_journal_sz(cct->_conf->osd_journal_size);
conf_journal_sz <<=
20;
if ((cct->_conf->osd_journal_size ==
0) && (oldsize < ONE_MEG)) {
¦ derr <<
"I'm sorry, I don't know how large of a journal to create."
¦<<
"Please specify a block device to use as the journal OR "
¦<<
"set osd_journal_size in your ceph.conf" << dendl;
¦
return -EINVAL;
}
if (create && (oldsize < conf_journal_sz)) {
¦ uint64_t newsize(conf_journal_sz);
¦ dout(
10) << __func__ <<
" _open extending to " << newsize <<
" bytes" << dendl;
¦ ret = ::ftruncate(fd, newsize);
¦
if (ret <
0) {
¦ ¦
int err = errno;
¦ ¦ derr <<
"FileJournal::_open_file : unable to extend journal to "
¦ ¦<< newsize <<
" bytes: " << cpp_strerror(err) << dendl;
¦ ¦
return -err;
¦ }
#ifdef HAVE_POSIX_FALLOCATE
¦ ret = ::posix_fallocate(fd,
0, newsize);
¦
if (ret) {
¦ ¦ derr <<
"FileJournal::_open_file : unable to preallocation journal to "
¦ ¦<< newsize <<
" bytes: " << cpp_strerror(ret) << dendl;
¦ ¦
return -ret;
¦ }
¦ max_size = newsize;
#elif defined(__APPLE__)
¦ fstore_t store;
¦ store.fst_flags = F_ALLOCATECONTIG;
¦ store.fst_posmode = F_PEOFPOSMODE;
¦ store.fst_offset =
0;
¦ store.fst_length = newsize;
¦ ret = ::fcntl(fd, F_PREALLOCATE, &store);
¦
if (ret == -
1) {
¦ ¦ ret = -errno;
¦ ¦ derr <<
"FileJournal::_open_file : unable to preallocation journal to "
¦ ¦<< newsize <<
" bytes: " << cpp_strerror(ret) << dendl;
¦ ¦
return ret;
¦ }
¦ max_size = newsize;
#else
# error "Journal pre-allocation not supported on platform."
#endif
}
else {
¦ max_size = oldsize;
}
block_size = cct->_conf->journal_block_size;
if (create && cct->_conf->journal_zero_on_create) {
¦ derr <<
"FileJournal::_open_file : zeroing journal" << dendl;
¦ uint64_t write_size =
1 <<
20;
¦
char *buf;
¦ ret = ::posix_memalign((
void **)&buf, block_size, write_size);
¦
if (ret !=
0) {
¦ ¦
return -ret;
¦ }
¦
memset(
static_cast<
void*>(buf),
0, write_size);
¦ uint64_t i =
0;
¦
for (; (i + write_size) <= (uint64_t)max_size; i += write_size) {
¦ ¦ ret = ::pwrite(fd,
static_cast<
void*>(buf), write_size, i);
¦ ¦
if (ret <
0) {
free(buf);
return -errno;
¦ ¦ }
¦ }
¦
if (i < (uint64_t)max_size) {
¦ ¦ ret = ::pwrite(fd,
static_cast<
void*>(buf), max_size - i, i);
¦ ¦
if (ret <
0) {
free(buf);
return -errno;
¦ ¦ }
¦ }
¦
free(buf);
}
dout(
10) <<
"_open journal is not a block device, NOT checking disk "
¦ ¦ ¦ ¦ ¦<<
"write cache on '" << fn <<
"'" << dendl;
return 0;
}
读取日志的头,该头在日志的第一个块中
/**
 * Read and decode the journal header from offset 0 of the journal.
 *
 * Reads one block_size-sized, page-aligned buffer. If pread() returns fewer
 * bytes than requested, the rest of the buffer is zero-filled before
 * decoding. A decoded `flags` value greater than 3 is treated as garbage
 * and reset to 0. The decoded header is logged via print_header().
 *
 * @param hdr  output; receives the decoded header
 * @return 0 on success; -errno if pread() fails; -EINVAL if the header
 *         cannot be decoded
 */
int FileJournal::read_header(header_t *hdr) const
{
  dout(10) << "read_header" << dendl;

  buffer::ptr bp = buffer::create_page_aligned(block_size);
  char *const base = bp.c_str();

  const int nread = ::pread(fd, base, bp.length(), 0);
  if (nread < 0) {
    const int err = errno;
    dout(0) << "read_header got " << cpp_strerror(err) << dendl;
    return -err;
  }

  // Short read: zero the part of the buffer that pread() did not fill.
  const size_t filled = static_cast<size_t>(nread);
  if (filled != bp.length()) {
    memset(base + filled, 0, bp.length() - filled);
  }

  bufferlist bl;
  bl.push_back(std::move(bp));

  try {
    bufferlist::iterator it = bl.begin();
    ::decode(*hdr, it);
  }
  catch (buffer::error& e) {
    derr << "read_header error decoding journal header" << dendl;
    return -EINVAL;
  }

  if (hdr->flags > 3) {
    derr << "read_header appears to have gibberish flags; assuming 0" << dendl;
    hdr->flags = 0;
  }

  print_header(*hdr);
  return 0;
}
void FileJournal
::print_header(const header_t
&header) const
{
dout(
10)
<< "header: block_size " << header.block_size
¦ ¦
<< " alignment " << header.alignment
¦ ¦
<< " max_size " << header.max_size
¦ ¦
<< dendl;
dout(
10)
<< "header: start " << header.start
<< dendl;
dout(
10)
<< " write_pos " << write_pos
<< dendl;
}