center of tech

Okay, my fix for the memory leaks on the MDS went fine. Now I’ve got a problem on the DS:
> ::findleaks
CACHE LEAKED BUFCTL CALLER
ffffff01c682a860 1 ffffff01d49a0068 dserv_mds_do_reportavail+0x210
ffffff01c68262e0 520364 ffffff01ea054458 dserv_nnode_from_fh_ds+0x5b
ffffff01c68262e0 1258470 ffffff01f3ed5db0 dserv_nnode_from_fh_ds+0x5b
ffffff01c68262e0 1343031 ffffff01f3ed5e88 dserv_nnode_from_fh_ds+0x77
ffffff01c68262e0 435756 ffffff01ee4b65f8 dserv_nnode_from_fh_ds+0x77
ffffff01c68262e0 1 ffffff01de76e558 mds_compound+0x54
ffffff01c68262e0 3 ffffff01e83df668 mds_compound+0x54
ffffff01c682b2e0 4 ffffff01d99cea30 mds_compound+0x193
ffffff01c6828020 2 ffffff01dcd9f8f8 mds_get_server_impl_id+0x30
ffffff01c68262e0 2 ffffff01dd4350d8 mds_get_server_impl_id+0x58
ffffff01c6826b20 2 ffffff01d7153290 mds_get_server_impl_id+0x8a
ffffff01c6828860 1 ffffff01d8805120 modinstall+0x129
ffffff01c6828860 1 ffffff01d7fe7a98 modinstall+0x129
ffffff01c682b2e0 1 ffffff01ddd5a870 modinstall+0x129
ffffff01c6828020 1 ffffff0f82dd4b68 rpc_init_taglist+0x25
ffffff01c6828020 20145 ffffff020b8337a0 rpc_init_taglist+0x25
ffffff01c6828020 1 ffffff01d613e538 rpc_init_taglist+0x25
ffffff01c6828020 11460 ffffff01e20b1de8 rpc_init_taglist+0x25
ffffff01c6828020 1 ffffff0f82de32e0 rpc_init_taglist+0x25
ffffff01c6828020 1 ffffff0f82de1708 rpc_init_taglist+0x25
ffffff01c6828020 2 ffffff01ddda3018 rpc_init_taglist+0x25
ffffff01c68265a0 2 ffffff01dc8f41b0 tohex+0x32
ffffff01c6828020 3 ffffff01dcda06c0 xdr_array+0xae
ffffff01c6828020 1 ffffff01df04e7f8 xdr_array+0xae
ffffff01c68262e0 1083945 ffffff01edde37b0 xdr_bytes+0x70
ffffff01c68282e0 1 ffffff01f31f9e60 xdr_bytes+0x70
ffffff01c68285a0 1 ffffff01e1798598 xdr_bytes+0x70
ffffff01c68262e0 694948 ffffff01f3ed5c00 xdr_bytes+0x70
------------------------------------------------------------------------
Total 5368151 buffers, 86904136 bytes
Even though we still see a high number of leaks in xdr_bytes(), which we
expect to indicate that the mds_sid is not getting cleaned up, I’ll argue
that this is to be expected with the high number of leaks in
dserv_nnode_from_fh_ds(). I.e., if the file handle is not being nuked,
we can’t expect the mds_sid component to also be zapped.
> ffffff01ea054458$<bufctl_audit
ADDR BUFADDR TIMESTAMP THREAD
CACHE LASTLOG CONTENTS
ffffff01ea054458 ffffff01e9f71d50 89f9dc8f5e7 ffffff01d7aa93a0
ffffff01c68262e0 ffffff01c747f9c0 ffffff01cfd78cd8
kmem_cache_alloc_debug+0x283
kmem_cache_alloc+0xa9
kmem_alloc+0xa3
dserv_nnode_from_fh_ds+0x5b
nnode_from_fh_v41+0x45
mds_op_putfh+0xe3
rfs41_op_dispatch+0x72
mds_compound+0x1d4
rfs41_dispatch+0x17c
dispatch_dserv_nfsv41+0x93
svc_getreq+0x20d
svc_run+0x197
svc_do_run+0x81
nfssys+0xa0e
The issue is actually very simple to understand - prior to my changes, the dnk_sid
field was not being used. But that field is a mds_sid, which requires a memory
allocation to use:
426 static nnode_error_t
427 dserv_nnode_from_fh_ds(nnode_t **npp, mds_ds_fh *fhp)
428 {
429 dserv_nnode_key_t dskey;
430 nnode_key_t key;
431 uint32_t hash;
432
433 if (fhp->vers fh.v1.mds_fid.len fh.v1.mds_fid.len > DS_MAXFIDSZ)
438 return (ESTALE);
439 /* XXX cannot do sid until mds_sid is sane */
440 dskey.dnk_sid = NULL;
441 dskey.dnk_fid = &fhp->fh.v1.mds_fid;
442
443 hash = dserv_nnode_hash(&dskey);
444
445 key.nk_keydata = &dskey;
446 key.nk_compare = dserv_nnode_compare;
447
448 return (nnode_find_or_create(npp, &key, hash, fhp, dserv_nnode_build));
449 }
I changed this to be:
425 static nnode_error_t
426 dserv_nnode_from_fh_ds(nnode_t **npp, mds_ds_fh *fhp)
427 {
428 dserv_nnode_key_t dskey;
429 nnode_key_t key;
430 uint32_t hash;
431
432 if (fhp->vers fh.v1.mds_fid.len fh.v1.mds_fid.len > DS_MAXFIDSZ)
437 return (ESTALE);
438
439 /*
440 * XXX: Should we be mallocing in the middle of a
441 * cache? Cache entry does not have enough space.
442 * What is the use of the mds_sid here?
443 */
444 dskey.dnk_sid = kmem_alloc(sizeof (mds_sid), KM_SLEEP);
445 dskey.dnk_sid->len = fhp->fh.v1.mds_sid.len;
446 dskey.dnk_sid->val = kmem_alloc(dskey.dnk_sid->len,
447 KM_SLEEP);
448
449 bcopy(fhp->fh.v1.mds_sid.val, dskey.dnk_sid->val,
450 dskey.dnk_sid->len);
451
452 dskey.dnk_fid = &fhp->fh.v1.mds_fid;
453
454 hash = dserv_nnode_hash(&dskey);
455
456 key.nk_keydata = &dskey;
457 key.nk_compare = dserv_nnode_compare;
458
459 return (nnode_find_or_create(npp, &key, hash, fhp, dserv_nnode_build));
460 }
The problem here is that both dskey and key are pulled off of the stack. We
don’t expect them to have a life after dserv_nnode_from_fh_ds() and before my
changes, there was no need to release any memory.
<pThe question becomes is the key copied elsewhere (which means we the mds_sid
might be copied as well) or can we just always release it after 459 (bear with
me on that statement, I’ll fix it up)? We need to look at nnode_find_or_create().
The key is only used here in the find case:
637 rw_enter(&bucket->nb_lock, rw);
638 found = avl_find(&bucket->nb_tree, &key, &where);
639 if (found) {
And indeed, we can easily follow where the compare function has to be from line 457
above:
310 static int
311 dserv_nnode_compare(const void *va, const void *vb)
312 {
313 const dserv_nnode_key_t *a = va;
314 const dserv_nnode_key_t *b = vb;
315 int rc;
316
317 NFS_AVL_COMPARE(a->dnk_sid->len, b->dnk_sid->len);
318 rc = memcmp(a->dnk_sid->val, b->dnk_sid->val, a->dnk_sid->len);
319 NFS_AVL_RETURN(rc);
320
321 NFS_AVL_COMPARE(a->dnk_fid->len, b->dnk_fid->len);
322 rc = memcmp(a->dnk_fid->val, b->dnk_fid->val, a->dnk_fid->len);
323 NFS_AVL_RETURN(rc);
324
325 return (0);
326 }
And I added that code on 317-319!
It is not used in the create case:
674 rc = nnode_build(npp, data, nnbuild); 675 if (rc == 0) 676 avl_insert(&bucket->nb_tree, *npp, where);
So the fix will be:
457 ne = nnode_find_or_create(npp, &key, hash, fhp, dserv_nnode_build); 458 459 kmem_free(dskey.dnk_sid->val, dskey.dnk_sid->len); 460 kmem_free(dskey.dnk_sid, sizeof (mds_sid)); 461 462 return (ne); 463 }
Ouch, I thought I was done! Remember earlier I stated that the xdr_bytes
would be from this code? Well I decided to challenge that statement:
> ffffff01edde37b0$<bufctl_audit
ADDR BUFADDR TIMESTAMP THREAD
CACHE LASTLOG CONTENTS
ffffff01edde37b0 ffffff01edc60168 89f9c4733bc ffffff01d7aa93a0
ffffff01c68262e0 ffffff01c9a25b80 ffffff01cca20030
kmem_cache_alloc_debug+0x283
kmem_cache_alloc+0xa9
kmem_alloc+0xa3
xdr_bytes+0x70
xdr_mds_sid+0x21
xdr_ds_fh_v1+0x68
xdr_ds_fh+0x3f
xdr_decode_nfs41_fh+0xdd
xdr_snfs_argop4+0x5e
xdr_COMPOUND4args_srv+0xf4
svc_authany_wrap+0x22
svc_cots_kgetargs+0x41
dispatch_dserv_nfsv41+0x5d
svc_getreq+0x20d
svc_run+0x197
In retrospect, “D’oh!”, we can see we are not decoding the xdr here. But then,
in my defense, I made that statement before we looked at the code!
Ahh, looks like we have some hand written XDR code:
4034 static bool_t
4035 xdr_snfs_argop4_free(XDR *xdrs, nfs_argop4 **arrayp, int len)
4036 {
...
4056 case OP_PUTFH:
4057 if (array[i].nfs_argop4_u.opputfh.object.nfs_fh4_val !=
4058 NULL) {
4059 kmem_free(array[i].nfs_argop4_u.opputfh.object.
4060 nfs_fh4_val,
4061 array[i].nfs_argop4_u.opputfh.object.
4062 nfs_fh4_len);
4063 }
Which does not free the mds_sid portion of the file handle. This hand encoding
is a nightmare to unravel. I’m going to have to pass in the minorversion:
5022 (void) xdr_snfs_argop4_free(xdrs, &objp->array, 5023 objp->array_len, objp->minorversion);
to determine whether or not to check for the mds_sid. If it is a minorversion,
then I’ll need to check whether it is a normal NFSv41 filehandle or a DS filehandle:
4057 case OP_PUTFH:
4058 if (array[i].nfs_argop4_u.opputfh.object.nfs_fh4_val ==
4059 NULL)
4060 continue;
4061
4062 if (minorversion != 0) {
4063 struct mds_ds_fh *dsfh =
4064 (struct mds_ds_fh *)array[i].nfs_argop4_u.
4065 opputfh.object.nfs_fh4_val;
4066
4067 /*
4068 * Is it really a DS filehandle?
4069 */
4070 if (fh4->type == FH41_TYPE_DMU_DS) {
4071 mds_sid *sid = &dsfh->fh.v1.mds_sid;
4072 if (sid->val) {
4073 kmem_free(sid->val, sid->len);
4074 }
4075 }
4076 }
4077
4078 kmem_free(array[i].nfs_argop4_u.opputfh.object.
4079 nfs_fh4_val,
4080 array[i].nfs_argop4_u.opputfh.object.
4081 nfs_fh4_len);
The key to understanding this code (and the headache) is to look at the definitions
of our filehandles:
typedef struct {
nfs41_fh_type_t type;
nfs41_fh_vers_t vers;
union {
nfs41_fh_v1_t v1;
/* new versions will be added here */
} fh;
} nfs41_fh_fmt_t;
struct mds_ds_fh {
nfs41_fh_type_t type; /* MDS or DS? */
ds_fh_version vers; /* OTW version number */
union {
ds_fh_v1 v1;
/* new versions will be added here */
} fh;
};
typedef struct mds_ds_fh mds_ds_fh;
You can typecast nfs_fh4_val in several ways, i.e.:
4063 struct mds_ds_fh *dsfh = 4064 (struct mds_ds_fh *)array[i].nfs_argop4_u. 4065 opputfh.object.nfs_fh4_val;
but you always have to check the type to make sure it is valid. The key
to understanding this comes out of nfs41_filehandle_xdr.c:
132 bool_t
133 xdr_decode_nfs41_fh(XDR *xdrs, nfs_fh4 *objp)
134 {
135
136 uint_t otw_len;
137 uint_t type;
138
139 ASSERT(xdrs->x_op == XDR_DECODE);
140
141 objp->nfs_fh4_val = NULL;
142 objp->nfs_fh4_len = 0;
...
150 /* Get the filehandle type */
151 if (!xdr_enum(xdrs, (enum_t *)&type))
152 return (FALSE);
153
154 switch (type) {
155 case FH41_TYPE_NFS: {
156 nfs41_fh_fmt_t *nfhp = NULL;
157
158 nfhp = kmem_zalloc(sizeof (nfs41_fh_fmt_t), KM_SLEEP);
159 nfhp->type = FH41_TYPE_NFS;
...
170 case FH41_TYPE_DMU_DS: {
171 struct mds_ds_fh *dfhp = NULL;
172
173 dfhp = kmem_zalloc(sizeof (struct mds_ds_fh), KM_SLEEP);
174 dfhp->type = FH41_TYPE_DMU_DS;
Since both structures have the type as the first field, after the typecast,
we will be able to determine if that was valid to do.
So the first memory leak I fixed was one I had added. The second one was
a pre-existing one. Running findleaks has saved my bacon here.
And of course, I’m going to have to check one more time to make sure that
my fixes plugged the leaks!
Source/Kaynak : http://blogs.sun.com/tdh/entry/more_memory_leaks_but_this