1 /*
2  * Copyright 2000-2002 by Hans Reiser, licensing governed by reiserfs/README
3  */
4 
5 /**
6  ** old_item_num
7  ** old_entry_num
8  ** set_entry_sizes
9  ** create_virtual_node
10  ** check_left
11  ** check_right
12  ** directory_part_size
13  ** get_num_ver
14  ** set_parameters
15  ** is_leaf_removable
16  ** are_leaves_removable
17  ** get_empty_nodes
18  ** get_lfree
19  ** get_rfree
20  ** is_left_neighbor_in_cache
21  ** decrement_key
22  ** get_far_parent
23  ** get_parents
24  ** can_node_be_removed
25  ** ip_check_balance
26  ** dc_check_balance_internal
27  ** dc_check_balance_leaf
28  ** dc_check_balance
29  ** check_balance
30  ** get_direct_parent
31  ** get_neighbors
32  ** fix_nodes
33  **
34  **
35  **/
36 
37 
38 #include <linux/config.h>
39 #include <linux/sched.h>
40 #include <linux/string.h>
41 #include <linux/locks.h>
42 #include <linux/reiserfs_fs.h>
43 
44 
45 /* To make any changes in the tree we find a node, that contains item
46    to be changed/deleted or position in the node we insert a new item
47    to. We call this node S. To do balancing we need to decide what we
48    will shift to left/right neighbor, or to a new node, where new item
49    will be etc. To make this analysis simpler we build virtual
50    node. Virtual node is an array of items, that will replace items of
51    node S. (For instance if we are going to delete an item, virtual
52    node does not contain it). Virtual node keeps information about
53    item sizes and types, mergeability of first and last items, sizes
54    of all entries in directory item. We use this array of items when
55    calculating what we can shift to neighbors and how many nodes we
56    have to have if we do not any shiftings, if we shift to left/right
57    neighbor or to both. */
58 
59 
60 /* taking item number in virtual node, returns number of item, that it has in source buffer */
old_item_num(int new_num,int affected_item_num,int mode)61 static inline int old_item_num (int new_num, int affected_item_num, int mode)
62 {
63   if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
64     return new_num;
65 
66   if (mode == M_INSERT) {
67 
68     RFALSE( new_num == 0,
69 	    "vs-8005: for INSERT mode and item number of inserted item");
70 
71     return new_num - 1;
72   }
73 
74   RFALSE( mode != M_DELETE,
75 	  "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode);
76   /* delete mode */
77   return new_num + 1;
78 }
79 
/* Build the virtual node describing what S[h] will contain after the
   pending operation (vn_mode: insert/paste/cut/delete) is applied.
   The virtual item array records sizes, item headers and left/right
   mergeability; the balancing analysis below consults it when
   deciding what can be shifted to the neighbors. */
static void create_virtual_node (struct tree_balance * tb, int h)
{
    struct item_head * ih;
    struct virtual_node * vn = tb->tb_vn;
    int new_num;
    struct buffer_head * Sh;	/* this comes from tb->S[h] */

    Sh = PATH_H_PBUFFER (tb->tb_path, h);

    /* size of changed node: current used space plus the pending
       insert_size (negative for cut/delete) */
    vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h];

    /* for internal nodes array of virtual items is not created; only
       the resulting item count is needed */
    if (h) {
	vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
	return;
    }

    /* number of items in virtual node: one more on insert, one less
       on delete */
    vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0);

    /* first virtual item: the array lives immediately after the
       virtual node header inside the tb->vn_buf work area */
    vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
    memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item));
    vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item);


    /* first item in the node */
    ih = B_N_PITEM_HEAD (Sh, 0);

    /* define the mergeability for 0-th item (if it is not being deleted) */
    if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
	    vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;

    /* go through all items those remain in the virtual node (except for the new (inserted) one) */
    for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) {
	int j;
	struct virtual_item * vi = vn->vn_vi + new_num;
	int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1);


	/* the inserted item does not exist in the source buffer yet;
	   it is filled in separately below */
	if (is_affected && vn->vn_mode == M_INSERT)
	    continue;

	/* get item number in source node */
	j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode);

	/* virtual length includes the item header */
	vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
	vi->vi_ih = ih + j;
	vi->vi_item = B_I_PITEM (Sh, ih + j);
	vi->vi_uarea = vn->vn_free_ptr;

	// FIXME: there is no check, that item operation did not
	// consume too much memory
	vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]);
	if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
	    reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: "
			    "virtual node space consumed");

	if (!is_affected)
	    /* this is not being changed */
	    continue;

	/* paste/cut grow (or shrink, insert_size < 0 for cut) the
	   affected item by insert_size[0] */
	if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
	    vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
	    vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
	}
    }


    /* virtual inserted item is not defined yet */
    if (vn->vn_mode == M_INSERT) {
	struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num;

	RFALSE( vn->vn_ins_ih == 0,
		"vs-8040: item header of inserted item is not specified");
	vi->vi_item_len = tb->insert_size[0];
	vi->vi_ih = vn->vn_ins_ih;
	vi->vi_item = vn->vn_data;
	vi->vi_uarea = vn->vn_free_ptr;

	op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]);
    }

    /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
    if (tb->CFR[0]) {
	struct key * key;

	key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]);
	if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE ||
						       vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1))
		vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE;

#ifdef CONFIG_REISERFS_CHECK
	if (op_is_left_mergeable (key, Sh->b_size) &&
	    !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) {
	    /* we delete last item and it could be merged with right neighbor's first item */
	    if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) &&
		  I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) {
		/* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
		print_block (Sh, 0, -1, -1);
		reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
				key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE);
	    } else
		/* we can delete directory item, that has only one directory entry in it */
		;
	}
#endif

    }
}
191 
192 
193 /* using virtual node check, how many items can be shifted to left
194    neighbor */
/* Using the virtual node, compute how many items (tb->lnum[h]) and,
   on the leaf level, how many bytes/entries of the boundary item
   (tb->lbytes) can be shifted to the left neighbor given its free
   space cur_free.  lbytes == -1 means "no partial item shifted". */
static void check_left (struct tree_balance * tb, int h, int cur_free)
{
    int i;
    struct virtual_node * vn = tb->tb_vn;
    struct virtual_item * vi;
    int d_size, ih_size;

    RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);

    /* internal level: each shifted pointer costs DC_SIZE plus the key
       that moves up into the parent */
    if (h > 0) {
	tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
	return;
    }

    /* leaf level */

    if (!cur_free || !vn->vn_nr_item) {
	/* no free space or nothing to move */
	tb->lnum[h] = 0;
	tb->lbytes = -1;
	return;
    }

    RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
	    "vs-8055: parent does not exist or invalid");

    vi = vn->vn_vi;
    /* if the first item merges with L[0]'s last item, one item header
       is saved when everything is moved */
    if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
	/* all contents of S[0] fits into L[0] */

	RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
		"vs-8055: invalid mode or balance condition failed");

	tb->lnum[0] = vn->vn_nr_item;
	tb->lbytes = -1;
	return;
    }


    d_size = 0, ih_size = IH_SIZE;

    /* first item may merge with last item in left neighbor: its
       header is not copied, so credit IH_SIZE back */
    if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
	d_size = -((int)IH_SIZE), ih_size = 0;

    tb->lnum[0] = 0;
    /* note: d_size and ih_size are reset to their defaults at each
       iteration by the for-clause; the mergeable discount applies to
       the first item only */
    for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) {
	d_size += vi->vi_item_len;
	if (cur_free >= d_size) {
	    /* the item can be shifted entirely */
	    cur_free -= d_size;
	    tb->lnum[0] ++;
	    continue;
	}

	/* the item cannot be shifted entirely, try to split it */
	/* check whether L[0] can hold ih and at least one byte of the item body */
	if (cur_free <= ih_size) {
	    /* cannot shift even a part of the current item */
	    tb->lbytes = -1;
	    return;
	}
	cur_free -= ih_size;

	/* ask the item-type operation how much of the body fits */
	tb->lbytes = op_check_left (vi, cur_free, 0, 0);
	if (tb->lbytes != -1)
	    /* count partially shifted item */
	    tb->lnum[0] ++;

	break;
    }

    return;
}
270 
271 
272 /* using virtual node check, how many items can be shifted to right
273    neighbor */
/* Mirror of check_left: compute how many items (tb->rnum[h]) and, on
   the leaf level, how many bytes/entries of the boundary item
   (tb->rbytes) can be shifted to the right neighbor given its free
   space cur_free.  Iterates from the last item backwards. */
static void check_right (struct tree_balance * tb, int h, int cur_free)
{
    int i;
    struct virtual_node * vn = tb->tb_vn;
    struct virtual_item * vi;
    int d_size, ih_size;

    RFALSE( cur_free < 0, "vs-8070: cur_free < 0");

    /* internal level: each shifted pointer costs DC_SIZE plus a key */
    if (h > 0) {
	tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
	return;
    }

    /* leaf level */

    if (!cur_free || !vn->vn_nr_item) {
	/* no free space or nothing to move */
	tb->rnum[h] = 0;
	tb->rbytes = -1;
	return;
    }

    RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
	    "vs-8075: parent does not exist or invalid");

    vi = vn->vn_vi + vn->vn_nr_item - 1;
    /* if the last item merges with R[0]'s first item, one item header
       is saved when everything is moved */
    if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
	/* all contents of S[0] fits into R[0] */

	RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
		"vs-8080: invalid mode or balance condition failed");

	tb->rnum[h] = vn->vn_nr_item;
	tb->rbytes = -1;
	return;
    }

    d_size = 0, ih_size = IH_SIZE;

    /* last item may merge with first item in right neighbor: its
       header is not copied, so credit IH_SIZE back */
    if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
	d_size = -(int)IH_SIZE, ih_size = 0;

    tb->rnum[0] = 0;
    /* d_size/ih_size reset each iteration; the mergeable discount
       applies only to the last item */
    for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) {
	d_size += vi->vi_item_len;
	if (cur_free >= d_size) {
	    /* the item can be shifted entirely */
	    cur_free -= d_size;
	    tb->rnum[0] ++;
	    continue;
	}

	/* check whether R[0] can hold ih and at least one byte of the item body */
	if ( cur_free <= ih_size ) {    /* cannot shift even a part of the current item */
	    tb->rbytes = -1;
	    return;
	}

	/* R[0] can hold the header of the item and at least one byte of its body */
	cur_free -= ih_size;	/* cur_free is still > 0 */

	tb->rbytes = op_check_right (vi, cur_free);
	if (tb->rbytes != -1)
	    /* count partially shifted item */
	    tb->rnum[0] ++;

	break;
    }

  return;
}
348 
349 
350 /*
351  * from - number of items, which are shifted to left neighbor entirely
352  * to - number of item, which are shifted to right neighbor entirely
353  * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
354  * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
/* Compute how many nodes the remaining content needs after 'from'
   items (plus from_bytes of the boundary item) go left and 'to' items
   (plus to_bytes) go right.  Fills snum012[0..2] with the item counts
   for S[0], S1new, S2new and snum012[3..4] with the split-item byte
   (entry) counts s1bytes/s2bytes.  Returns the number of nodes
   needed.  'flow' != 0 permits splitting items across nodes. */
static int get_num_ver (int mode, struct tree_balance * tb, int h,
			int from, int from_bytes,
			int to,   int to_bytes,
			short * snum012, int flow
    )
{
    int i;
    int cur_free;
    //    int bytes;
    int units;
    struct virtual_node * vn = tb->tb_vn;
    //    struct virtual_item * vi;

    int total_node_size, max_node_size, current_item_size;
    int needed_nodes;
    int start_item, 	/* position of item we start filling node from */
	end_item,	/* position of item we finish filling node by */
	start_bytes,/* number of first bytes (entries for directory) of start_item-th item
		       we do not include into node that is being filled */
	end_bytes;	/* number of last bytes (entries for directory) of end_item-th item
			   we do not include into node that is being filled */
    int split_item_positions[2]; /* these are positions in virtual item of
				    items, that are split between S[0] and
				    S1new and S1new and S2new */

    split_item_positions[0] = -1;
    split_item_positions[1] = -1;

    /* We only create additional nodes if we are in insert or paste mode
       or we are in replace mode at the internal level. If h is 0 and
       the mode is M_REPLACE then in fix_nodes we change the mode to
       paste or insert before we get here in the code.  */
    RFALSE( tb->insert_size[h] < 0  || (mode != M_INSERT && mode != M_PASTE),
	    "vs-8100: insert_size < 0 in overflow");

    max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h));

    /* snum012 [0-2] - number of items, that lay
       to S[0], first new node and second new node */
    snum012[3] = -1;	/* s1bytes */
    snum012[4] = -1;	/* s2bytes */

    /* internal level: a node holds (to - from) pointers plus keys;
       just divide by capacity */
    if (h > 0) {
	i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
	if (i == max_node_size)
	    return 1;
	return (i / max_node_size + 1);
    }

    /* leaf level */
    needed_nodes = 1;
    total_node_size = 0;
    cur_free = max_node_size;

    // start from 'from'-th item
    start_item = from;
    // skip its first 'start_bytes' units (they went to the left neighbor)
    start_bytes = ((from_bytes != -1) ? from_bytes : 0);

    // last included item is the 'end_item'-th one
    end_item = vn->vn_nr_item - to - 1;
    // do not count last 'end_bytes' units of 'end_item'-th item
    // (they went to the right neighbor)
    end_bytes = (to_bytes != -1) ? to_bytes : 0;

    /* go through all item beginning from the start_item-th item and ending by
       the end_item-th item. Do not count first 'start_bytes' units of
       'start_item'-th item and last 'end_bytes' of 'end_item'-th item */

    for (i = start_item; i <= end_item; i ++) {
	struct virtual_item * vi = vn->vn_vi + i;
	int skip_from_end = ((i == end_item) ? end_bytes : 0);

	RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed");

	/* get size of current item */
	current_item_size = vi->vi_item_len;

	/* do not take in calculation head part (from_bytes) of from-th item */
	current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes);

	/* do not take in calculation tail part of last item */
	current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end);

	/* if item fits into current node entirely */
	if (total_node_size + current_item_size <= max_node_size) {
	    snum012[needed_nodes - 1] ++;
	    total_node_size += current_item_size;
	    start_bytes = 0;
	    continue;
	}

	if (current_item_size > max_node_size) {
	    /* virtual item length is longer, than max size of item in
               a node. It is impossible for direct item */
	    RFALSE( is_direct_le_ih (vi->vi_ih),
		    "vs-8110: "
		    "direct item length is %d. It can not be longer than %d",
		    current_item_size, max_node_size);
	    /* we will try to split it */
	    flow = 1;
	}

	if (!flow) {
	    /* as we do not split items, take new node and continue */
	    needed_nodes ++; i --; total_node_size = 0;
	    continue;
	}

	// calculate number of item units which fit into node being
	// filled
	{
	    int free_space;

	    /* IH_SIZE is reserved for the split item's header */
	    free_space = max_node_size - total_node_size - IH_SIZE;
	    units = op_check_left (vi, free_space, start_bytes, skip_from_end);
	    if (units == -1) {
		/* nothing fits into current node, take new node and continue */
		needed_nodes ++, i--, total_node_size = 0;
		continue;
	    }
	}

	/* something fits into the current node */
	//if (snum012[3] != -1 || needed_nodes != 1)
	//  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
	//snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
	start_bytes += units;
	snum012[needed_nodes - 1 + 3] = units;

	if (needed_nodes > 2)
	    reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: split_item_position is out of boundary\n");
	snum012[needed_nodes - 1] ++;
	split_item_positions[needed_nodes - 1] = i;
	needed_nodes ++;
	/* continue from the same item with start_bytes != -1 */
	start_item = i;
	i --;
	total_node_size = 0;
    }

    // sum012[4] (if it is not -1) contains number of units of which
    // are to be in S1new, snum012[3] - to be in S0. They are supposed
    // to be S1bytes and S2bytes correspondingly, so recalculate
    if (snum012[4] > 0) {
	int split_item_num;
	int bytes_to_r, bytes_to_l;
	int bytes_to_S1new;

	split_item_num = split_item_positions[1];
	bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
	bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
	bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0);

	// s2bytes = units of the split item that are left for S2new
	snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new;

	if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY)
	    reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not directory item\n");
    }

    /* now we know S2bytes, calculate S1bytes */
    if (snum012[3] > 0) {
	int split_item_num;
	int bytes_to_r, bytes_to_l;
	int bytes_to_S2new;

	split_item_num = split_item_positions[0];
	bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
	bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
	bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0);

	// s1bytes = units of the split item that are left for S1new
	snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new;
    }

    return needed_nodes;
}
533 
534 
535 #ifdef CONFIG_REISERFS_CHECK
536 extern struct tree_balance * cur_tb;
537 #endif
538 
539 
540 /* Set parameters for balancing.
541  * Performs write of results of analysis of balancing into structure tb,
542  * where it will later be used by the functions that actually do the balancing.
543  * Parameters:
544  *	tb	tree_balance structure;
545  *	h	current level of the node;
546  *	lnum	number of items from S[h] that must be shifted to L[h];
547  *	rnum	number of items from S[h] that must be shifted to R[h];
548  *	blk_num	number of blocks that S[h] will be splitted into;
549  *	s012	number of items that fall into splitted nodes.
550  *	lbytes	number of bytes which flow to the left neighbor from the item that is not
551  *		not shifted entirely
552  *	rbytes	number of bytes which flow to the right neighbor from the item that is not
553  *		not shifted entirely
554  *	s1bytes	number of bytes which flow to the first  new node when S[0] splits (this number is contained in s012 array)
555  */
556 
set_parameters(struct tree_balance * tb,int h,int lnum,int rnum,int blk_num,short * s012,int lb,int rb)557 static void set_parameters (struct tree_balance * tb, int h, int lnum,
558 			    int rnum, int blk_num, short * s012, int lb, int rb)
559 {
560 
561   tb->lnum[h] = lnum;
562   tb->rnum[h] = rnum;
563   tb->blknum[h] = blk_num;
564 
565   if (h == 0)
566     {  /* only for leaf level */
567       if (s012 != NULL)
568 	{
569 	  tb->s0num = * s012 ++,
570 	  tb->s1num = * s012 ++,
571 	  tb->s2num = * s012 ++;
572 	  tb->s1bytes = * s012 ++;
573 	  tb->s2bytes = * s012;
574 	}
575       tb->lbytes = lb;
576       tb->rbytes = rb;
577     }
578   PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum );
579   PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum );
580 
581   PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb );
582   PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb );
583 }
584 
585 
586 
587 /* check, does node disappear if we shift tb->lnum[0] items to left
588    neighbor and tb->rnum[0] to the right one. */
is_leaf_removable(struct tree_balance * tb)589 static int is_leaf_removable (struct tree_balance * tb)
590 {
591   struct virtual_node * vn = tb->tb_vn;
592   int to_left, to_right;
593   int size;
594   int remain_items;
595 
596   /* number of items, that will be shifted to left (right) neighbor
597      entirely */
598   to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
599   to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
600   remain_items = vn->vn_nr_item;
601 
602   /* how many items remain in S[0] after shiftings to neighbors */
603   remain_items -= (to_left + to_right);
604 
605   if (remain_items < 1) {
606     /* all content of node can be shifted to neighbors */
607     set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1);
608     return 1;
609   }
610 
611   if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
612     /* S[0] is not removable */
613     return 0;
614 
615   /* check, whether we can divide 1 remaining item between neighbors */
616 
617   /* get size of remaining item (in item units) */
618   size = op_unit_num (&(vn->vn_vi[to_left]));
619 
620   if (tb->lbytes + tb->rbytes >= size) {
621     set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1);
622     return 1;
623   }
624 
625   return 0;
626 }
627 
628 
629 /* check whether L, S, R can be joined in one node */
/* check whether L, S, R can be joined in one node: returns 1 (and
   records the decision via set_parameters) when the combined free
   space of the neighbors can absorb all of S[0]'s content. */
static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree)
{
  struct virtual_node * vn = tb->tb_vn;
  int ih_size;	/* item-header bytes saved by merging boundary items */
  struct buffer_head *S0;

  S0 = PATH_H_PBUFFER (tb->tb_path, 0);

  ih_size = 0;
  if (vn->vn_nr_item) {
    /* each mergeable boundary item saves one item header */
    if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
      ih_size += IH_SIZE;

	if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE)
	    ih_size += IH_SIZE;
    } else {
	/* there was only one item and it will be deleted */
	struct item_head * ih;

    RFALSE( B_NR_ITEMS (S0) != 1,
	    "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0));

    ih = B_N_PITEM_HEAD (S0, 0);
    if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])))
	if (is_direntry_le_ih (ih)) {
	    /* Directory must be in correct state here: that is
	       somewhere at the left side should exist first directory
	       item. But the item being deleted can not be that first
	       one because its right neighbor is item of the same
	       directory. (But first item always gets deleted in last
	       turn). So, neighbors of deleted item can be merged, so
	       we can save ih_size */
	    ih_size = IH_SIZE;

	    /* we might check that left neighbor exists and is of the
	       same directory */
	    RFALSE(le_ih_k_offset (ih) == DOT_OFFSET,
		"vs-8130: first directory item can not be removed until directory is not empty");
      }

  }

  /* everything fits if the used space of S[0] (vn_size plus the part
     MAX_CHILD_SIZE accounts for) does not exceed the neighbors' free
     space plus the saved header bytes */
  if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) {
    set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1);
    PROC_INFO_INC( tb -> tb_sb, leaves_removable );
    return 1;
  }
  return 0;

}
680 
681 
682 
683 /* when we do not split item, lnum and rnum are numbers of entire items */
/* Record the decision to shift items to the left neighbor.  Expects
   h, Sh, vn, tb, lpar, lnver, lset and snum012 in the caller's scope.
   On internal levels the key count to move is computed so that S[h]
   and L[h] end up with roughly equal fill; on the leaf level the
   precomputed lpar/lbytes analysis is used, LEFT_SHIFT_FLOW selecting
   whether the boundary item is split (lbytes passed) or only whole
   items are moved. */
#define SET_PAR_SHIFT_LEFT \
if (h)\
{\
   int to_l;\
   \
   to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
	      (MAX_NR_KEY(Sh) + 1 - lpar);\
	      \
	      set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
}\
else \
{\
   if (lset==LEFT_SHIFT_FLOW)\
     set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
		     tb->lbytes, -1);\
   else\
     set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
		     -1, -1);\
}
703 
704 
/* Mirror of SET_PAR_SHIFT_LEFT for the right neighbor.  Expects h,
   Sh, vn, tb, rpar, rnver, rset and snum012 in the caller's scope;
   RIGHT_SHIFT_FLOW selects whether the boundary item may be split
   (rbytes passed) or only whole items are moved. */
#define SET_PAR_SHIFT_RIGHT \
if (h)\
{\
   int to_r;\
   \
   to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
   \
   set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
}\
else \
{\
   if (rset==RIGHT_SHIFT_FLOW)\
     set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
		  -1, tb->rbytes);\
   else\
     set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
		  -1, -1);\
}
723 
724 
free_buffers_in_tb(struct tree_balance * p_s_tb)725 void free_buffers_in_tb (
726 		       struct tree_balance * p_s_tb
727 		       ) {
728   int n_counter;
729 
730   decrement_counters_in_path(p_s_tb->tb_path);
731 
732   for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) {
733     decrement_bcount(p_s_tb->L[n_counter]);
734     p_s_tb->L[n_counter] = NULL;
735     decrement_bcount(p_s_tb->R[n_counter]);
736     p_s_tb->R[n_counter] = NULL;
737     decrement_bcount(p_s_tb->FL[n_counter]);
738     p_s_tb->FL[n_counter] = NULL;
739     decrement_bcount(p_s_tb->FR[n_counter]);
740     p_s_tb->FR[n_counter] = NULL;
741     decrement_bcount(p_s_tb->CFL[n_counter]);
742     p_s_tb->CFL[n_counter] = NULL;
743     decrement_bcount(p_s_tb->CFR[n_counter]);
744     p_s_tb->CFR[n_counter] = NULL;
745   }
746 }
747 
748 
749 /* Get new buffers for storing new nodes that are created while balancing.
750  * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
751  *	        CARRY_ON - schedule didn't occur while the function worked;
752  *	        NO_DISK_SPACE - no disk space.
753  */
754 /* The function is NOT SCHEDULE-SAFE! */
static int  get_empty_nodes(
              struct tree_balance * p_s_tb,
              int n_h
            ) {
  struct buffer_head  * p_s_new_bh,
    		      *	p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h);
  unsigned long	      *	p_n_blocknr,
    			a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, };
  int       		n_counter,
   			n_number_of_freeblk,
                	n_amount_needed,/* number of needed empty blocks */
			n_retval = CARRY_ON;
  struct super_block *	p_s_sb = p_s_tb->tb_sb;


  /* number_of_freeblk is the number of empty blocks which have been
     acquired for use by the balancing algorithm minus the number of
     empty blocks used in the previous levels of the analysis,
     number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
     after empty blocks are acquired, and the balancing analysis is
     then restarted, amount_needed is the number needed by this level
     (n_h) of the balancing analysis.

     Note that for systems with many processes writing, it would be
     more layout optimal to calculate the total number needed by all
     levels and then to run reiserfs_new_blocks to get all of them at once.  */

  /* Initiate number_of_freeblk to the amount acquired prior to the restart of
     the analysis or 0 if not restarted, then subtract the amount needed
     by all of the levels of the tree below n_h. */
  /* blknum includes S[n_h], so we subtract 1 in this calculation */
  for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ )
    n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0;

  /* Allocate missing empty blocks. */
  /* if p_s_Sh == 0  then we are getting a new root */
  n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1;
  /*  Amount_needed = the amount that we need more than the amount that we have. */
  if ( n_amount_needed > n_number_of_freeblk )
    n_amount_needed -= n_number_of_freeblk;
  else /* If we have enough already then there is nothing to do. */
    return CARRY_ON;

  /* ask the allocator for the missing block numbers */
  if ( reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
                                   n_amount_needed) == NO_DISK_SPACE )
    return NO_DISK_SPACE;

  /* for each blocknumber we just got, get a buffer and stick it on FEB */
  for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed;
	p_n_blocknr++, n_counter++ ) {

    RFALSE( ! *p_n_blocknr,
	    "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");

    p_s_new_bh = getblk(p_s_sb->s_dev, *p_n_blocknr, p_s_sb->s_blocksize);
    if (atomic_read (&(p_s_new_bh->b_count)) > 1) {
      /* someone else also holds this buffer (b_count > 1 counts our
	 getblk reference plus at least one other) */
/*&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&*/
/*
      reiserfs_warning (p_s_sb, "waiting for buffer %b, iput inode pid = %d, this pid %d, mode %c, %h\n",
			p_s_new_bh, put_inode_pid, current->pid, p_s_tb->tb_vn->vn_mode, p_s_tb->tb_vn->vn_ins_ih);
      print_tb (0, 0, 0, p_s_tb, "tb");
*/
/*&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&*/
      if (atomic_read(&(p_s_new_bh->b_count)) > 2 ||
          !(buffer_journaled(p_s_new_bh) || buffer_journal_dirty(p_s_new_bh))) {
	/* not merely pinned by the journal: release everything, wait
	   for the buffer to be freed, and restart the search */
	n_retval = REPEAT_SEARCH ;
	free_buffers_in_tb (p_s_tb);
	wait_buffer_until_released (p_s_new_bh);
      }
    }
    RFALSE( (atomic_read (&(p_s_new_bh->b_count)) != 1 ||
	     buffer_dirty (p_s_new_bh)) &&
	    (atomic_read(&(p_s_new_bh->b_count)) > 2 ||
	     !(buffer_journaled(p_s_new_bh) ||
	       buffer_journal_dirty(p_s_new_bh))),
	    "PAP-8140: not free or dirty buffer %b for the new block",
	    p_s_new_bh);

    /* Put empty buffers into the array. */
    if (p_s_tb->FEB[p_s_tb->cur_blknum])
      BUG();

    mark_buffer_journal_new(p_s_new_bh) ;
    p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
  }

  /* even if everything went well, a schedule may have invalidated the
     analysis; the caller must then repeat the search */
  if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) )
    n_retval = REPEAT_SEARCH ;

  return n_retval;
}
846 
847 
848 /* Get free space of the left neighbor, which is stored in the parent
849  * node of the left neighbor.  */
get_lfree(struct tree_balance * tb,int h)850 static int get_lfree (struct tree_balance * tb, int h)
851 {
852     struct buffer_head * l, * f;
853     int order;
854 
855     if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
856 	return 0;
857 
858     if (f == l)
859 	order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1;
860     else {
861 	order = B_NR_ITEMS (l);
862 	f = l;
863     }
864 
865     return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order)));
866 }
867 
868 
869 /* Get free space of the right neighbor,
870  * which is stored in the parent node of the right neighbor.
871  */
get_rfree(struct tree_balance * tb,int h)872 static int get_rfree (struct tree_balance * tb, int h)
873 {
874   struct buffer_head * r, * f;
875   int order;
876 
877   if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
878     return 0;
879 
880   if (f == r)
881       order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1;
882   else {
883       order = 0;
884       f = r;
885   }
886 
887   return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order)));
888 
889 }
890 
891 
892 /* Check whether left neighbor is in memory. */
is_left_neighbor_in_cache(struct tree_balance * p_s_tb,int n_h)893 static int  is_left_neighbor_in_cache(
894               struct tree_balance * p_s_tb,
895               int                   n_h
896             ) {
897   struct buffer_head  * p_s_father, * left;
898   struct super_block  * p_s_sb = p_s_tb->tb_sb;
899   unsigned long         n_left_neighbor_blocknr;
900   int                   n_left_neighbor_position;
901 
902   if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */
903     return 0;
904 
905   /* Calculate father of the node to be balanced. */
906   p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
907 
908   RFALSE( ! p_s_father ||
909 	  ! B_IS_IN_TREE (p_s_father) ||
910 	  ! B_IS_IN_TREE (p_s_tb->FL[n_h]) ||
911 	  ! buffer_uptodate (p_s_father) ||
912 	  ! buffer_uptodate (p_s_tb->FL[n_h]),
913 	  "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
914 	  p_s_father, p_s_tb->FL[n_h]);
915 
916 
917   /* Get position of the pointer to the left neighbor into the left father. */
918   n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ?
919                       p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
920   /* Get left neighbor block number. */
921   n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
922   /* Look for the left neighbor in the cache. */
923   if ( (left = sb_get_hash_table(p_s_sb, n_left_neighbor_blocknr)) ) {
924 
925     RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left),
926 	    "vs-8170: left neighbor (%b %z) is not in the tree", left, left);
927     put_bh(left) ;
928     return 1;
929   }
930 
931   return 0;
932 }
933 
934 
935 #define LEFT_PARENTS  'l'
936 #define RIGHT_PARENTS 'r'
937 
938 
decrement_key(struct cpu_key * p_s_key)939 static void decrement_key (struct cpu_key * p_s_key)
940 {
941     // call item specific function for this key
942     item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key);
943 }
944 
945 
946 
947 
/* Calculate far left/right parent of the left/right neighbor of the current node, that
 * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
 * Calculate left/right common parent of the current node and L[h]/R[h].
 * Calculate left/right delimiting key position.
 * Returns:	REPEAT_SEARCH    - path in the tree is not correct, or a schedule
 *	                           occurred while the function worked;
 *	        IO_ERROR         - I/O error while reading the neighbor's parent;
 *	        CARRY_ON         - schedule didn't occur while the function worked;
 */
static int  get_far_parent (struct tree_balance *   p_s_tb,
			    int                     n_h,
			    struct buffer_head  **  pp_s_father,
			    struct buffer_head  **  pp_s_com_father,
			    char                    c_lr_par)
{
    struct buffer_head  * p_s_parent;
    INITIALIZE_PATH (s_path_to_neighbor_father);
    struct path * p_s_path = p_s_tb->tb_path;
    struct cpu_key	s_lr_father_key;
    int                   n_counter,
	n_position = INT_MAX,
	n_first_last_position = 0,
	n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);

    /* Starting from F[n_h] go upwards in the tree, and look for the common
      ancestor of F[n_h], and its neighbor l/r, that should be obtained. */

    n_counter = n_path_offset;

    RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET,
	    "PAP-8180: invalid path length");


    for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--  )  {
	/* Check whether parent of the current buffer in the path is really parent in the tree. */
	if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) )
	    return REPEAT_SEARCH;
	/* Check whether position in the parent is correct. */
	if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) )
	    return REPEAT_SEARCH;
	/* Check whether parent at the path really points to the child. */
	if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
	     PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr )
	    return REPEAT_SEARCH;
	/* Return delimiting key if position in the parent is not equal to first/last one. */
	/* The "border" position is 0 for a left-side search (the initial
	   value of n_first_last_position) and the last child pointer for a
	   right-side search; an ancestor at a non-border position is the
	   common father of the node and its l/r neighbor. */
	if ( c_lr_par == RIGHT_PARENTS )
	    n_first_last_position = B_NR_ITEMS (p_s_parent);
	if ( n_position != n_first_last_position ) {
	    *pp_s_com_father = p_s_parent;
	    get_bh(*pp_s_com_father) ;
	    /*(*pp_s_com_father = p_s_parent)->b_count++;*/
	    break;
	}
    }

    /* if we are in the root of the tree, then there is no common father */
    if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) {
	/* Check whether first buffer in the path is the root of the tree. */
	if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
	     SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
	    /* The node has no neighbor on this side at all. */
	    *pp_s_father = *pp_s_com_father = NULL;
	    return CARRY_ON;
	}
	/* Path does not reach the real root: it is stale. */
	return REPEAT_SEARCH;
    }

    RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
	    "PAP-8185: (%b %z) level too small",
	    *pp_s_com_father, *pp_s_com_father);

    /* Check whether the common parent is locked. */

    if ( buffer_locked (*pp_s_com_father) ) {
	__wait_on_buffer(*pp_s_com_father);
	if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
	    /* Tree changed while we slept: drop our reference and retry. */
	    decrement_bcount(*pp_s_com_father);
	    return REPEAT_SEARCH;
	}
    }

    /* So, we got common parent of the current node and its left/right neighbor.
     Now we are geting the parent of the left/right neighbor. */

    /* Form key to get parent of the left/right neighbor.  As a side effect
       record the delimiting key position in lkey[]/rkey[] for level n_h-1. */
    le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ?
						     (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position)));


    if ( c_lr_par == LEFT_PARENTS )
	decrement_key(&s_lr_father_key);

    if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR)
	// path is released
	return IO_ERROR;

    if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
	decrement_counters_in_path(&s_path_to_neighbor_father);
	decrement_bcount(*pp_s_com_father);
	return REPEAT_SEARCH;
    }

    *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);

    RFALSE( B_LEVEL (*pp_s_father) != n_h + 1,
	    "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
    RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET,
	    "PAP-8192: path length is too small");

    /* Shorten the temporary path past its last element so that the release
       below drops every reference except the one we keep on *pp_s_father. */
    s_path_to_neighbor_father.path_length--;
    decrement_counters_in_path(&s_path_to_neighbor_father);
    return CARRY_ON;
}
1059 
1060 
1061 /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1062  * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1063  * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1064  * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
 * Returns:	REPEAT_SEARCH - schedule occurred while the function worked, or
 *	                        the path became invalid (via get_far_parent);
 *	        IO_ERROR      - I/O error propagated from get_far_parent;
 *	        CARRY_ON      - schedule didn't occur while the function worked;
1067  */
static int  get_parents (struct tree_balance * p_s_tb, int n_h)
{
    struct path         * p_s_path = p_s_tb->tb_path;
    int                   n_position,
	n_ret_value,
	n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
    struct buffer_head  * p_s_curf,
	* p_s_curcf;

    /* Current node is the root of the tree or will be root of the tree */
    if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
	/* The root can not have parents.
	   Release nodes which previously were obtained as parents of the current node neighbors. */
	decrement_bcount(p_s_tb->FL[n_h]);
	decrement_bcount(p_s_tb->CFL[n_h]);
	decrement_bcount(p_s_tb->FR[n_h]);
	decrement_bcount(p_s_tb->CFR[n_h]);
	p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL;
	return CARRY_ON;
    }

    /* Get parent FL[n_path_offset] of L[n_path_offset]. */
    if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) )  {
	/* Current node is not the first child of its parent. */
	/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
	p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
	/* Two references on the same buffer: one will be owned by
	   FL[n_h], the other by CFL[n_h] (assigned below). */
	get_bh(p_s_curf) ;
	get_bh(p_s_curf) ;
	p_s_tb->lkey[n_h] = n_position - 1;
    }
    else  {
	/* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
	   Calculate current common parent of L[n_path_offset] and the current node. Note that
	   CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
	   Calculate lkey[n_path_offset]. */
	if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
					   &p_s_curcf, LEFT_PARENTS)) != CARRY_ON )
	    return n_ret_value;
    }

    /* Drop the old references before installing the new ones. */
    decrement_bcount(p_s_tb->FL[n_h]);
    p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */
    decrement_bcount(p_s_tb->CFL[n_h]);
    p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */

    RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
	    (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
	    "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);

/* Get parent FR[n_h] of R[n_h]. */

/* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
    if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) {
/* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
   Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
   not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
	if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,  &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON )
	    return n_ret_value;
    }
    else {
/* Current node is not the last child of its parent F[n_h]. */
	/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
	p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
	/* Again two references: one for FR[n_h], one for CFR[n_h]. */
	get_bh(p_s_curf) ;
	get_bh(p_s_curf) ;
	p_s_tb->rkey[n_h] = n_position;
    }

    decrement_bcount(p_s_tb->FR[n_h]);
    p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */

    decrement_bcount(p_s_tb->CFR[n_h]);
    p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */

    RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
            (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
	    "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);

    return CARRY_ON;
}
1148 
1149 
1150 /* it is possible to remove node as result of shiftings to
1151    neighbors even when we insert or paste item. */
static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h)
{
    struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h);
    int levbytes = tb->insert_size[h];
    struct item_head * ih;
    struct key * r_key = NULL;

    /* First item of S[h], and the right delimiting key (when the common
       parent CFR[h] is known) -- used to account for the item heads that
       merging border items would save during shifting. */
    ih = B_N_PITEM_HEAD (Sh, 0);
    if ( tb->CFR[h] )
	r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]);

    /* S[h] is removable only when its whole contents plus the bytes being
       inserted (levbytes) fit into the combined free space of L[h], S[h]
       and R[h].  At the leaf level (h == 0) a mergeable border item saves
       one item head (IH_SIZE) per mergeable side; at internal levels one
       extra delimiting key (KEY_SIZE) must also be accommodated. */
    if (
	lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
	/* shifting may merge items which might save space */
	- (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0)
	- (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0)
	+ (( h ) ? KEY_SIZE : 0))
    {
	/* node can not be removed */
	if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */
	    if ( ! h )
		tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0);
	    set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
	    return NO_BALANCING_NEEDED;
	}
    }
    /* Either the node is removable or the new data does not fit: the
       caller must do a full balance analysis. */
    PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] );
    return !NO_BALANCING_NEEDED;
}
1181 
1182 
1183 
1184 /* Check whether current node S[h] is balanced when increasing its size by
1185  * Inserting or Pasting.
1186  * Calculate parameters for balancing for current level h.
1187  * Parameters:
1188  *	tb	tree_balance structure;
1189  *	h	current level of the node;
1190  *	inum	item number in S[h];
1191  *	mode	i - insert, p - paste;
 * Returns:	REPEAT_SEARCH       - schedule occurred (repeat the search);
 *	        CARRY_ON            - balancing for higher levels needed;
 *	        NO_BALANCING_NEEDED - no balancing for higher levels needed;
 *	        NO_DISK_SPACE       - no disk space.
1196  */
1197 /* ip means Inserting or Pasting */
static int ip_check_balance (struct tree_balance * tb, int h)
{
    struct virtual_node * vn = tb->tb_vn;
    int levbytes,  /* Number of bytes that must be inserted into (value
		      is negative if bytes are deleted) buffer which
		      contains node being balanced.  The mnemonic is
		      that the attempted change in node space used level
		      is levbytes bytes. */
	n_ret_value;

    int lfree, sfree, rfree /* free space in L, S and R */;

    /* nver is short for number of vertixes, and lnver is the number if
       we shift to the left, rnver is the number if we shift to the
       right, and lrnver is the number if we shift in both directions.
       The goal is to minimize first the number of vertixes, and second,
       the number of vertixes whose contents are changed by shifting,
       and third the number of uncached vertixes whose contents are
       changed by shifting and must be read from disk.  */
    int nver, lnver, rnver, lrnver;

    /* used at leaf level only, S0 = S[0] is the node being balanced,
       sInum [ I = 0,1,2 ] is the number of items that will
       remain in node SI after balancing.  S1 and S2 are new
       nodes that might be created. */

    /* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
       where 4th parameter is s1bytes and 5th - s2bytes
    */
    short snum012[40] = {0,};	/* s0num, s1num, s2num for 8 cases
				   0,1 - do not shift and do not shift but bottle
				   2 - shift only whole item to left
				   3 - shift to left and bottle as much as possible
				   4,5 - shift to right	(whole items and as much as possible
				   6,7 - shift to both directions (whole items and as much as possible)
				*/

    /* Sh is the node whose balance is currently being checked */
    struct buffer_head * Sh;

    Sh = PATH_H_PBUFFER (tb->tb_path, h);
    levbytes = tb->insert_size[h];

    /* Calculate balance parameters for creating new root. */
    if ( ! Sh )  {
	if ( ! h )
	    reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0");
	switch ( n_ret_value = get_empty_nodes (tb, h) )  {
	case CARRY_ON:
	    set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
	    return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */

	case NO_DISK_SPACE:
	case REPEAT_SEARCH:
	    return n_ret_value;
	default:
	    reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
	}
    }

    if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */
	return n_ret_value;

    sfree = B_FREE_SPACE (Sh);

    /* get free space of neighbors */
    rfree = get_rfree (tb, h);
    lfree = get_lfree (tb, h);

    if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED)
	/* and new item fits into node S[h] without any shifting */
	return NO_BALANCING_NEEDED;

    create_virtual_node (tb, h);

    /*
	determine maximal number of items we can shift to the left neighbor (in tb structure)
	and the maximal number of bytes that can flow to the left neighbor
	from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
    */
    check_left (tb, h, lfree);

    /*
      determine maximal number of items we can shift to the right neighbor (in tb structure)
      and the maximal number of bytes that can flow to the right neighbor
      from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
    */
    check_right (tb, h, rfree);


    /* all contents of internal node S[h] can be moved into its
       neighbors, S[h] will be removed after balancing */
    if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
	int to_r;

	/* Since we are working on internal nodes, and our internal
	   nodes have fixed size entries, then we can balance by the
	   number of items rather than the space they consume.  In this
	   routine we set the left node equal to the right node,
	   allowing a difference of less than or equal to 1 child
	   pointer. */
	to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
	    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
	set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
	return CARRY_ON;
    }

    /* this checks balance condition, that any two neighboring nodes can not fit in one node */
    RFALSE( h &&
	    ( tb->lnum[h] >= vn->vn_nr_item + 1 ||
	      tb->rnum[h] >= vn->vn_nr_item + 1),
	    "vs-8220: tree is not balanced on internal level");
    RFALSE( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
		    (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ),
	    "vs-8225: tree is not balanced on leaf level");

    /* all contents of S[0] can be moved into its neighbors
       S[0] will be removed after balancing. */
    if (!h && is_leaf_removable (tb))
	return CARRY_ON;


    /* why do we perform this check here rather than earlier??
       Answer: we can win 1 node in some cases above. Moreover we
       checked it above, when we checked, that S[0] is not removable
       in principle */
    if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
	if ( ! h )
	    tb->s0num = vn->vn_nr_item;
	set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
	return NO_BALANCING_NEEDED;
    }


    {
	int lpar, rpar, nset, lset, rset, lrset;
	/*
	 * regular overflowing of the node
	 */

	/* get_num_ver works in 2 modes (FLOW & NO_FLOW)
	   lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
	   nset, lset, rset, lrset - shows, whether flowing items give better packing
	*/
#define FLOW 1
#define NO_FLOW 0	/* do not any splitting */

	/* we choose one the following */
	/* each constant is the offset of that scenario's s012 numbers
	   inside snum012[] (5 shorts per scenario, NO_FLOW then FLOW) */
#define NOTHING_SHIFT_NO_FLOW	0
#define NOTHING_SHIFT_FLOW	5
#define LEFT_SHIFT_NO_FLOW	10
#define LEFT_SHIFT_FLOW		15
#define RIGHT_SHIFT_NO_FLOW	20
#define RIGHT_SHIFT_FLOW	25
#define LR_SHIFT_NO_FLOW	30
#define LR_SHIFT_FLOW		35


	/* maximal shiftable item counts as computed by check_left/check_right */
	lpar = tb->lnum[h];
	rpar = tb->rnum[h];


	/* calculate number of blocks S[h] must be split into when
	   nothing is shifted to the neighbors,
	   as well as number of items in each part of the split node (s012 numbers),
	   and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
	nset = NOTHING_SHIFT_NO_FLOW;
	nver = get_num_ver (vn->vn_mode, tb, h,
			    0, -1, h?vn->vn_nr_item:0, -1,
			    snum012, NO_FLOW);

	if (!h)
	{
	    int nver1;

	    /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
	    nver1 = get_num_ver (vn->vn_mode, tb, h,
				 0, -1, 0, -1,
				 snum012 + NOTHING_SHIFT_FLOW, FLOW);
	    if (nver > nver1)
		nset = NOTHING_SHIFT_FLOW, nver = nver1;
	}


	/* calculate number of blocks S[h] must be split into when
	   l_shift_num first items and l_shift_bytes of the right most
	   liquid item to be shifted are shifted to the left neighbor,
	   as well as number of items in each part of the splitted node (s012 numbers),
	   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
	*/
	lset = LEFT_SHIFT_NO_FLOW;
	lnver = get_num_ver (vn->vn_mode, tb, h,
			     lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1,
			     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
	if (!h)
	{
	    int lnver1;

	    lnver1 = get_num_ver (vn->vn_mode, tb, h,
				  lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1,
				  snum012 + LEFT_SHIFT_FLOW, FLOW);
	    if (lnver > lnver1)
		lset = LEFT_SHIFT_FLOW, lnver = lnver1;
	}


	/* calculate number of blocks S[h] must be split into when
	   r_shift_num first items and r_shift_bytes of the left most
	   liquid item to be shifted are shifted to the right neighbor,
	   as well as number of items in each part of the splitted node (s012 numbers),
	   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
	*/
	rset = RIGHT_SHIFT_NO_FLOW;
	rnver = get_num_ver (vn->vn_mode, tb, h,
			     0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1,
			     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
	if (!h)
	{
	    int rnver1;

	    rnver1 = get_num_ver (vn->vn_mode, tb, h,
				  0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
				  snum012 + RIGHT_SHIFT_FLOW, FLOW);

	    if (rnver > rnver1)
		rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
	}


	/* calculate number of blocks S[h] must be split into when
	   items are shifted in both directions,
	   as well as number of items in each part of the splitted node (s012 numbers),
	   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
	*/
	lrset = LR_SHIFT_NO_FLOW;
	lrnver = get_num_ver (vn->vn_mode, tb, h,
			      lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1,
			      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
	if (!h)
	{
	    int lrnver1;

	    lrnver1 = get_num_ver (vn->vn_mode, tb, h,
				   lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
				   snum012 + LR_SHIFT_FLOW, FLOW);
	    if (lrnver > lrnver1)
		lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
	}



	/* Our general shifting strategy is:
	   1) to minimized number of new nodes;
	   2) to minimized number of neighbors involved in shifting;
	   3) to minimized number of disk reads; */

	/* we can win TWO or ONE nodes by shifting in both directions */
	if (lrnver < lnver && lrnver < rnver)
	{
	    RFALSE( h &&
		    (tb->lnum[h] != 1 ||
		     tb->rnum[h] != 1 ||
		     lrnver != 1 || rnver != 2 || lnver != 2 || h != 1),
		    "vs-8230: bad h");
	    if (lrset == LR_SHIFT_FLOW)
		set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset,
				tb->lbytes, tb->rbytes);
	    else
		set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1),
				tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1);

	    return CARRY_ON;
	}

	/* if shifting doesn't lead to better packing then don't shift */
	if (nver == lrnver)
	{
	    set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1);
	    return CARRY_ON;
	}


	/* now we know that for better packing shifting in only one
	   direction either to the left or to the right is required */

	/*  if shifting to the left is better than shifting to the right */
	if (lnver < rnver)
	{
	    SET_PAR_SHIFT_LEFT;
	    return CARRY_ON;
	}

	/* if shifting to the right is better than shifting to the left */
	if (lnver > rnver)
	{
	    SET_PAR_SHIFT_RIGHT;
	    return CARRY_ON;
	}


	/* now shifting in either direction gives the same number
	   of nodes and we can make use of the cached neighbors */
	if (is_left_neighbor_in_cache (tb,h))
	{
	    SET_PAR_SHIFT_LEFT;
	    return CARRY_ON;
	}

	/* shift to the right independently on whether the right neighbor in cache or not */
	SET_PAR_SHIFT_RIGHT;
	return CARRY_ON;
    }
}
1511 
1512 
1513 /* Check whether current node S[h] is balanced when Decreasing its size by
1514  * Deleting or Cutting for INTERNAL node of S+tree.
1515  * Calculate parameters for balancing for current level h.
1516  * Parameters:
1517  *	tb	tree_balance structure;
1518  *	h	current level of the node;
1519  *	inum	item number in S[h];
 *	mode	d - delete, c - cut;
1521  * Returns:	1 - schedule occurred;
1522  *	        0 - balancing for higher levels needed;
1523  *	       -1 - no balancing for higher levels needed;
1524  *	       -2 - no disk space.
1525  *
1526  * Note: Items of internal nodes have fixed size, so the balance condition for
1527  * the internal part of S+tree is as for the B-trees.
1528  */
static int dc_check_balance_internal (struct tree_balance * tb, int h)
{
  struct virtual_node * vn = tb->tb_vn;

  /* Sh is the node whose balance is currently being checked,
     and Fh is its father.  */
  struct buffer_head * Sh, * Fh;
  int maxsize,
      n_ret_value;
  int lfree, rfree /* free space in L and R */;

  Sh = PATH_H_PBUFFER (tb->tb_path, h);
  Fh = PATH_H_PPARENT (tb->tb_path, h);

  /* NOTE(review): maxsize appears unused below in this function. */
  maxsize = MAX_CHILD_SIZE(Sh);

/*   using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
/*   new_nr_item = number of items node would have if operation is */
/* 	performed without balancing (new_nr_item); */
  create_virtual_node (tb, h);

  if ( ! Fh )
    {   /* S[h] is the root. */
      if ( vn->vn_nr_item > 0 )
	{
	  set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
	  return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
	}
      /* new_nr_item == 0.
       * Current root will be deleted resulting in
       * decrementing the tree height. */
      set_parameters (tb, h, 0, 0, 0, NULL, -1, -1);
      return CARRY_ON;
    }

  if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
    return n_ret_value;


  /* get free space of neighbors */
  rfree = get_rfree (tb, h);
  lfree = get_lfree (tb, h);

  /* determine maximal number of items we can fit into neighbors */
  check_left (tb, h, lfree);
  check_right (tb, h, rfree);


  if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) )
    { /* Balance condition for the internal node is valid.
       * In this case we balance only if it leads to better packing. */
      if ( vn->vn_nr_item == MIN_NR_KEY(Sh) )
	{ /* Here we join S[h] with one of its neighbors,
	   * which is impossible with greater values of new_nr_item. */
	  if ( tb->lnum[h] >= vn->vn_nr_item + 1 )
	    {
	      /* All contents of S[h] can be moved to L[h]. */
	      int n;
	      int order_L;

	      /* order_L is the position of the left neighbor's child
		 pointer in FL[h]; n is its number of children, derived
		 from the dc_size recorded in the parent. */
	      order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
	      n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
	      set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
	      return CARRY_ON;
	    }

	  if ( tb->rnum[h] >= vn->vn_nr_item + 1 )
	    {
	      /* All contents of S[h] can be moved to R[h]. */
	      int n;
	      int order_R;

	      /* order_R is the position of the right neighbor's child
		 pointer in FR[h]; n is its number of children. */
	      order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1;
	      n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
	      set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
	      return CARRY_ON;
	    }
	}

      if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
	{
	  /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
	  int to_r;

	  /* Balance by item count (fixed-size internal entries), leaving
	     L[h] and R[h] within one child pointer of each other -- same
	     formula as in ip_check_balance. */
	  to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
	    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
	  set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
	  return CARRY_ON;
	}

      /* Balancing does not lead to better packing. */
      set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
      return NO_BALANCING_NEEDED;
    }

  /* Current node contain insufficient number of items. Balancing is required. */
  /* Check whether we can merge S[h] with left neighbor. */
  if (tb->lnum[h] >= vn->vn_nr_item + 1)
    if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h])
      {
	int n;
	int order_L;

	order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
	n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
	set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
	return CARRY_ON;
      }

  /* Check whether we can merge S[h] with right neighbor. */
  if (tb->rnum[h] >= vn->vn_nr_item + 1)
    {
      int n;
      int order_R;

      order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1);
      n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
      set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
      return CARRY_ON;
    }

  /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
  if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
    {
      int to_r;

      to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
	(MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
      set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
      return CARRY_ON;
    }

  /* For internal nodes try to borrow item from a neighbor */
  RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");

  /* Borrow one or two items from caching neighbor */
  if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h])
    {
      int from_l;

      /* Borrow enough from L[h] to even out the two nodes. */
      from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 -  (vn->vn_nr_item + 1);
      set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1);
      return CARRY_ON;
    }

  /* Otherwise borrow from R[h], same evening-out computation. */
  set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1,
		  NULL, -1, -1);
  return CARRY_ON;
}
1678 
1679 
1680 /* Check whether current node S[h] is balanced when Decreasing its size by
1681  * Deleting or Truncating for LEAF node of S+tree.
1682  * Calculate parameters for balancing for current level h.
1683  * Parameters:
1684  *	tb	tree_balance structure;
1685  *	h	current level of the node;
1686  *	inum	item number in S[h];
1687  *	mode	i - insert, p - paste;
1688  * Returns:	1 - schedule occurred;
1689  *	        0 - balancing for higher levels needed;
1690  *	       -1 - no balancing for higher levels needed;
1691  *	       -2 - no disk space.
1692  */
static int dc_check_balance_leaf (struct tree_balance * tb, int h)
{
  struct virtual_node * vn = tb->tb_vn;

  /* Number of bytes that must be deleted from
     (value is negative if bytes are deleted) buffer which
     contains node being balanced.  The mnemonic is that the
     attempted change in node space used level is levbytes bytes. */
  int levbytes;
  /* the maximal item size */
  int maxsize,
      n_ret_value;
  /* S0 is the node whose balance is currently being checked,
     and F0 is its father.  */
  struct buffer_head * S0, * F0;
  int lfree, rfree /* free space in L and R */;

  S0 = PATH_H_PBUFFER (tb->tb_path, 0);
  F0 = PATH_H_PPARENT (tb->tb_path, 0);

  levbytes = tb->insert_size[h];

  maxsize = MAX_CHILD_SIZE(S0); 	/* maximal possible size of an item */

  if ( ! F0 )
    {  /* S[0] is the root now.  A root leaf is never merged with
	  neighbors (it has none), so nothing more to analyze. */

      RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0),
	      "vs-8240: attempt to create empty buffer tree");

      set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
      return NO_BALANCING_NEEDED;
    }

  /* get_parents may need to restart the search (schedule/IO); propagate
     anything other than CARRY_ON to the caller unchanged. */
  if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
    return n_ret_value;

  /* get free space of neighbors */
  rfree = get_rfree (tb, h);
  lfree = get_lfree (tb, h);

  create_virtual_node (tb, h);

  /* if 3 leaves can be merged into one, set parameters and return */
  if (are_leaves_removable (tb, lfree, rfree))
    return CARRY_ON;

  /* determine maximal number of items we can shift to the left/right  neighbor
     and the maximal number of bytes that can flow to the left/right neighbor
     from the left/right most liquid item that cannot be shifted from S[0] entirely
     */
  check_left (tb, h, lfree);
  check_right (tb, h, rfree);

  /* check whether we can merge S with left neighbor: everything fits to
     the left (lbytes == -1 means no item needs splitting), and either the
     left neighbor is already cached, or merging right is impossible. */
  if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
    if (is_left_neighbor_in_cache (tb,h) ||
	((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
	!tb->FR[h]) {

      RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist");

      /* set parameter to merge S[0] with its left neighbor */
      set_parameters (tb, h, -1, 0, 0, NULL, -1, -1);
      return CARRY_ON;
    }

  /* check whether we can merge S[0] with right neighbor. */
  if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
    set_parameters (tb, h, 0, -1, 0, NULL, -1, -1);
    return CARRY_ON;
  }

  /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
  if (is_leaf_removable (tb))
    return CARRY_ON;

  /* Balancing is not required: S[0] keeps all its items. */
  tb->s0num = vn->vn_nr_item;
  set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
  return NO_BALANCING_NEEDED;
}
1775 
1776 
1777 
1778 /* Check whether current node S[h] is balanced when Decreasing its size by
1779  * Deleting or Cutting.
1780  * Calculate parameters for balancing for current level h.
1781  * Parameters:
1782  *	tb	tree_balance structure;
1783  *	h	current level of the node;
1784  *	inum	item number in S[h];
1785  *	mode	d - delete, c - cut.
1786  * Returns:	1 - schedule occurred;
1787  *	        0 - balancing for higher levels needed;
1788  *	       -1 - no balancing for higher levels needed;
1789  *	       -2 - no disk space.
1790  */
dc_check_balance(struct tree_balance * tb,int h)1791 static int dc_check_balance (struct tree_balance * tb, int h)
1792 {
1793  RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized");
1794 
1795  if ( h )
1796    return dc_check_balance_internal (tb, h);
1797  else
1798    return dc_check_balance_leaf (tb, h);
1799 }
1800 
1801 
1802 
1803 /* Check whether current node S[h] is balanced.
1804  * Calculate parameters for balancing for current level h.
1805  * Parameters:
1806  *
1807  *	tb	tree_balance structure:
1808  *
1809  *              tb is a large structure that must be read about in the header file
1810  *              at the same time as this procedure if the reader is to successfully
1811  *              understand this procedure
1812  *
1813  *	h	current level of the node;
1814  *	inum	item number in S[h];
1815  *	mode	i - insert, p - paste, d - delete, c - cut.
1816  * Returns:	1 - schedule occurred;
1817  *	        0 - balancing for higher levels needed;
1818  *	       -1 - no balancing for higher levels needed;
1819  *	       -2 - no disk space.
1820  */
check_balance(int mode,struct tree_balance * tb,int h,int inum,int pos_in_item,struct item_head * ins_ih,const void * data)1821 static int check_balance (int mode,
1822 			  struct tree_balance * tb,
1823 			  int h,
1824 			  int inum,
1825 			  int pos_in_item,
1826 			  struct item_head * ins_ih,
1827 			  const void * data
1828 			  )
1829 {
1830   struct virtual_node * vn;
1831 
1832   vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1833   vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1834   vn->vn_mode = mode;
1835   vn->vn_affected_item_num = inum;
1836   vn->vn_pos_in_item = pos_in_item;
1837   vn->vn_ins_ih = ins_ih;
1838   vn->vn_data = data;
1839 
1840   RFALSE( mode == M_INSERT && !vn->vn_ins_ih,
1841 	  "vs-8255: ins_ih can not be 0 in insert mode");
1842 
1843  if ( tb->insert_size[h] > 0 )
1844    /* Calculate balance parameters when size of node is increasing. */
1845    return ip_check_balance (tb, h);
1846 
1847  /* Calculate balance parameters when  size of node is decreasing. */
1848  return dc_check_balance (tb, h);
1849 }
1850 
1851 
1852 
1853 /* Check whether parent at the path is the really parent of the current node.*/
static int  get_direct_parent(
              struct tree_balance * p_s_tb,
              int                   n_h
            ) {
    struct buffer_head  * p_s_bh;
    struct path         * p_s_path      = p_s_tb->tb_path;
    int                   n_position,
	n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);

    /* We are in the root or in the new root. */
    if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {

	RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
		"PAP-8260: illegal offset in the path");

	if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
	     SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
	    /* Root is not changed.  The root has no parent, so mark the
	       "parent" slot in the path as empty. */
	    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
	    PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
	    return CARRY_ON;
	}
	return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
    }

    if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) )
	return REPEAT_SEARCH; /* Parent in the path is not in the tree. */

    /* Position recorded in the path is no longer inside the parent. */
    if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) )
	return REPEAT_SEARCH;

    if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr )
	/* Parent in the path is not parent of the current node in the tree. */
	return REPEAT_SEARCH;

    /* Waiting on a locked buffer may schedule; re-check for a concurrent
       filesystem change afterwards. */
    if ( buffer_locked(p_s_bh) ) {
	__wait_on_buffer(p_s_bh);
	if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
	    return REPEAT_SEARCH;
    }

    return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node.  */
}
1897 
1898 
1899 /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
1900  * of S[n_h] we
1901  * need in order to balance S[n_h], and get them if necessary.
1902  * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
1903  *	        CARRY_ON - schedule didn't occur while the function worked;
1904  */
static int  get_neighbors(
	            struct tree_balance * p_s_tb,
	            int 		  n_h
	          ) {
    int		 	n_child_position,
	n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
    unsigned long		n_son_number;
    struct super_block  *	p_s_sb = p_s_tb->tb_sb;
    struct buffer_head  * p_s_bh;


    PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] );

    if ( p_s_tb->lnum[n_h] ) {
	/* We need left neighbor to balance S[n_h]. */
	PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] );
	p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);

	RFALSE( p_s_bh == p_s_tb->FL[n_h] &&
		! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
		"PAP-8270: invalid position in the parent");

	/* If FL[n_h] is the same node as the parent in the path, the left
	   neighbor is the child just left of S[n_h]; otherwise it is the
	   last child of FL[n_h]. */
	n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
	n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
	p_s_bh = reiserfs_bread(p_s_sb, n_son_number, p_s_sb->s_blocksize);
	if (!p_s_bh)
	    return IO_ERROR;
	/* reiserfs_bread may have blocked; if the tree changed under us,
	   drop the buffer and restart the whole search. */
	if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
	    decrement_bcount(p_s_bh);
	    PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
	    return REPEAT_SEARCH;
	}

	RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
                n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
	        B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
                p_s_bh->b_blocknr, "PAP-8275: invalid parent");
	RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
	RFALSE( ! n_h &&
                B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)),
                "PAP-8290: invalid child size of left neighbor");

	/* Replace any previously held left neighbor with the fresh one. */
	decrement_bcount(p_s_tb->L[n_h]);
	p_s_tb->L[n_h] = p_s_bh;
    }


    if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. */
	PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] );
	p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);

	RFALSE( p_s_bh == p_s_tb->FR[n_h] &&
		PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh),
		"PAP-8295: invalid position in the parent");

	/* Symmetric to the left case: either the child just right of
	   S[n_h], or the first child of FR[n_h]. */
	n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0;
	n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
	p_s_bh = reiserfs_bread(p_s_sb, n_son_number, p_s_sb->s_blocksize);
	if (!p_s_bh)
	    return IO_ERROR;
	if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
	    decrement_bcount(p_s_bh);
	    PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
	    return REPEAT_SEARCH;
	}
	decrement_bcount(p_s_tb->R[n_h]);
	p_s_tb->R[n_h] = p_s_bh;

	RFALSE( ! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)),
                "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
                B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh),
                dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)));

    }
    return CARRY_ON;
}
1981 
1982 #ifdef CONFIG_REISERFS_CHECK
/* Debug wrapper around kmalloc that accounts the allocation against the
   per-superblock s_kmallocs counter (built only under
   CONFIG_REISERFS_CHECK). */
void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s)
{
    /* Level of s_kmallocs at which the last warning was printed. */
    static size_t malloced;
    void * vp = kmalloc (size, flags);

    if (!vp)
	return NULL;

    /* Charge this allocation, and warn each time total usage has grown
       by more than 200000 bytes since the previous warning. */
    s->u.reiserfs_sb.s_kmallocs += size;
    if (s->u.reiserfs_sb.s_kmallocs > malloced + 200000) {
	reiserfs_warning (s, "vs-8301: reiserfs_kmalloc: allocated memory %d\n", s->u.reiserfs_sb.s_kmallocs);
	malloced = s->u.reiserfs_sb.s_kmallocs;
    }
    return vp;
}
2000 
/* Debug counterpart of reiserfs_kmalloc: free the memory and undo the
   s_kmallocs accounting (built only under CONFIG_REISERFS_CHECK). */
void reiserfs_kfree (const void * vp, size_t size, struct super_block * s)
{
    /* Freeing NULL is a no-op and must not disturb the counter. */
    if (vp == NULL)
	return;

    kfree (vp);

    /* A counter that goes negative means the size passed here did not
       match the size passed at allocation time. */
    s->u.reiserfs_sb.s_kmallocs -= size;
    if (s->u.reiserfs_sb.s_kmallocs < 0)
	reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d\n", s->u.reiserfs_sb.s_kmallocs);
}
2012 #endif
2013 
2014 
get_virtual_node_size(struct super_block * sb,struct buffer_head * bh)2015 static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh)
2016 {
2017     int max_num_of_items;
2018     int max_num_of_entries;
2019     unsigned long blocksize = sb->s_blocksize;
2020 
2021 #define MIN_NAME_LEN 1
2022 
2023     max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2024     max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2025                          (DEH_SIZE + MIN_NAME_LEN);
2026 
2027     return sizeof(struct virtual_node) +
2028            max(max_num_of_items * sizeof (struct virtual_item),
2029 	       sizeof (struct virtual_item) + sizeof(struct direntry_uarea) +
2030                (max_num_of_entries - 1) * sizeof (__u16));
2031 }
2032 
2033 
2034 
2035 /* maybe we should fail balancing we are going to perform when kmalloc
2036    fails several times. But now it will loop until kmalloc gets
2037    required memory */
/* Make sure tb->vn_buf is large enough to hold the virtual node for the
   current leaf.  Returns CARRY_ON on success, REPEAT_SEARCH when the
   caller must restart (allocation blocked, or the fs changed). */
static int get_mem_for_virtual_node (struct tree_balance * tb)
{
    int check_fs = 0;
    int size;
    char * buf;

    size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path));

    if (size > tb->vn_buf_size) {
	/* we have to allocate more memory for virtual node */
	if (tb->vn_buf) {
	    /* free memory allocated before */
	    reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
	    /* this is not needed if kfree is atomic */
            check_fs = 1;
	}

	/* virtual node requires now more memory */
	tb->vn_buf_size = size;

	/* get memory for virtual item; GFP_ATOMIC first so we do not
	   block while still holding the collected resources */
	buf = reiserfs_kmalloc(size, GFP_ATOMIC, tb->tb_sb);
	if ( ! buf ) {
	    /* getting memory with GFP_KERNEL priority may involve
               balancing now (due to indirect_to_direct conversion on
               dcache shrinking). So, release path and collected
               resources here */
	    free_buffers_in_tb (tb);
	    buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
	    if ( !buf ) {
#ifdef CONFIG_REISERFS_CHECK
		reiserfs_warning (tb->tb_sb, "vs-8345: get_mem_for_virtual_node: "
				  "kmalloc failed. reiserfs kmalloced %d bytes\n",
				  tb->tb_sb->u.reiserfs_sb.s_kmallocs);
#endif
		/* NOTE(review): on double failure vn_buf stays NULL and we
		   still return REPEAT_SEARCH -- see the comment above this
		   function: the caller loops until memory is available. */
		tb->vn_buf_size = 0;
	    }
	    tb->vn_buf = buf;
	    schedule() ;
	    return REPEAT_SEARCH;
	}

	tb->vn_buf = buf;
    }

    /* kfree above may have blocked; verify nothing changed meanwhile. */
    if ( check_fs && FILESYSTEM_CHANGED_TB (tb) )
        return REPEAT_SEARCH;

    return CARRY_ON;
}
2088 
2089 
2090 #ifdef CONFIG_REISERFS_CHECK
/* Panic if a buffer collected for balancing is in an obviously bad state:
   zero refcount, not uptodate, detached from the tree, or belonging to a
   different device/size/block range.  NULL buffers are silently accepted. */
static void tb_buffer_sanity_check (struct super_block * p_s_sb,
				    struct buffer_head * p_s_bh,
				    const char *descr, int level) {
  if (p_s_bh) {
    if (atomic_read (&(p_s_bh->b_count)) <= 0) {

      reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
    }

    if ( ! buffer_uptodate (p_s_bh) ) {
      reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh);
    }

    if ( ! B_IS_IN_TREE (p_s_bh) ) {
      reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh);
    }

    /* The buffer must belong to this filesystem's device, use its block
       size, and lie inside the filesystem. */
    if (p_s_bh->b_dev != p_s_sb->s_dev ||
	p_s_bh->b_size != p_s_sb->s_blocksize ||
	p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
      reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): check failed for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
    }
  }
}
2115 #else
/* No-op stub used when CONFIG_REISERFS_CHECK is disabled. */
static void tb_buffer_sanity_check (struct super_block * p_s_sb,
				    struct buffer_head * p_s_bh,
				    const char *descr, int level)
{;}
2120 #endif
2121 
/* Thin wrapper: the whole job is delegated to
   reiserfs_prepare_for_journal() (the name is historical). */
static void clear_all_dirty_bits(struct super_block *s,
                                 struct buffer_head *bh)
{
    reiserfs_prepare_for_journal(s, bh, 0);
}
2126 
/* Scan every buffer collected for the balance (path buffers, L/R
   neighbors and their parents/common parents, and the FEB list), prepare
   each for journaling, and wait until none of them is locked.  Returns
   CARRY_ON when all are unlocked, REPEAT_SEARCH if the filesystem changed
   while we slept on a locked buffer. */
static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
{
    struct buffer_head * locked;
#ifdef CONFIG_REISERFS_CHECK
    int repeat_counter = 0;
#endif
    int i;

    do {

	locked = NULL;

	/* Pass 1: the buffers along the search path itself. */
	for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) {
	    if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) {
		/* if I understand correctly, we can only be sure the last buffer
		** in the path is in the tree --clm
		*/
#ifdef CONFIG_REISERFS_CHECK
		if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
		    PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
		    tb_buffer_sanity_check (p_s_tb->tb_sb,
					    PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i),
					    "S",
					    p_s_tb->tb_path->path_length - i);
		}
#endif
		clear_all_dirty_bits(p_s_tb->tb_sb,
				     PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ;

		if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) )
		    locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
	    }
	}

	/* Pass 2: per-level neighbors and parents, for every level the
	   balance touches (insert_size[i] != 0). */
	for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) {

	    if (p_s_tb->lnum[i] ) {

		if ( p_s_tb->L[i] ) {
		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ;
		    if ( buffer_locked (p_s_tb->L[i]) )
			locked = p_s_tb->L[i];
		}

		if ( !locked && p_s_tb->FL[i] ) {
		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ;
		    if ( buffer_locked (p_s_tb->FL[i]) )
			locked = p_s_tb->FL[i];
		}

		if ( !locked && p_s_tb->CFL[i] ) {
		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ;
		    if ( buffer_locked (p_s_tb->CFL[i]) )
			locked = p_s_tb->CFL[i];
		}

	    }

	    if ( !locked && (p_s_tb->rnum[i]) ) {

		if ( p_s_tb->R[i] ) {
		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ;
		    if ( buffer_locked (p_s_tb->R[i]) )
			locked = p_s_tb->R[i];
		}


		if ( !locked && p_s_tb->FR[i] ) {
		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ;
		    if ( buffer_locked (p_s_tb->FR[i]) )
			locked = p_s_tb->FR[i];
		}

		if ( !locked && p_s_tb->CFR[i] ) {
		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ;
		    if ( buffer_locked (p_s_tb->CFR[i]) )
			locked = p_s_tb->CFR[i];
		}
	    }
	}
	/* as far as I can tell, this is not required.  The FEB list seems
	** to be full of newly allocated nodes, which will never be locked,
	** dirty, or anything else.
	** To be safe, I'm putting the checks and waits in.  For the moment,
	** they are needed to keep the code in journal.c from complaining
	** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
	** --clm
	*/
	for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) {
	    if ( p_s_tb->FEB[i] ) {
		clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ;
		if (buffer_locked(p_s_tb->FEB[i])) {
		    locked = p_s_tb->FEB[i] ;
		}
	    }
	}

	if (locked) {
#ifdef CONFIG_REISERFS_CHECK
	    repeat_counter++;
	    if ( (repeat_counter % 10000) == 0) {
		reiserfs_warning (p_s_tb->tb_sb, "wait_tb_buffers_until_released(): too many iterations waiting for buffer to unlock (%b)\n", locked);

		/* Don't loop forever.  Try to recover from possible error. */

		return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON;
	    }
#endif
	    /* Sleeping here may schedule; if the fs changed we must redo
	       the whole analysis. */
	    __wait_on_buffer (locked);
	    if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
		return REPEAT_SEARCH;
	    }
	}

    } while (locked);

    return CARRY_ON;
}
2251 
2252 
2253 /* Prepare for balancing, that is
2254  *	get all necessary parents, and neighbors;
2255  *	analyze what and where should be moved;
2256  *	get sufficient number of new nodes;
2257  * Balancing will start only after all resources will be collected at a time.
2258  *
2259  * When ported to SMP kernels, only at the last moment after all needed nodes
2260  * are collected in cache, will the resources be locked using the usual
2261  * textbook ordered lock acquisition algorithms.  Note that ensuring that
2262  * this code neither write locks what it does not need to write lock nor locks out of order
2263  * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
2264  *
2265  * fix is meant in the sense of render unchanging
2266  *
2267  * Latency might be improved by first gathering a list of what buffers are needed
2268  * and then getting as many of them in parallel as possible? -Hans
2269  *
2270  * Parameters:
2271  *	op_mode	i - insert, d - delete, c - cut (truncate), p - paste (append)
2272  *	tb	tree_balance structure;
2273  *	inum	item number in S[h];
2274  *      pos_in_item - comment this if you can
2275  *      ins_ih & ins_sd are used when inserting
2276  * Returns:	1 - schedule occurred while the function worked;
2277  *	        0 - schedule didn't occur while the function worked;
2278  *             -1 - if no_disk_space
2279  */
2280 
2281 
int fix_nodes (int n_op_mode,
	       struct tree_balance * 	p_s_tb,
	       struct item_head * p_s_ins_ih, // item head of item being inserted
	       const void * data // inserted item or data to be pasted
    ) {
    int	n_ret_value,
    	n_h,
    	n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
    int n_pos_in_item;

    /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
    ** during wait_tb_buffers_run
    */
    int wait_tb_buffers_run = 0 ;
    int windex ;
    struct buffer_head  * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);

    ++ p_s_tb -> tb_sb -> u.reiserfs_sb.s_fix_nodes;

    n_pos_in_item = p_s_tb->tb_path->pos_in_item;


    /* Snapshot the generation; FILESYSTEM_CHANGED_TB presumably compares
       against it to detect concurrent tree changes. */
    p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb);

    /* we prepare and log the super here so it will already be in the
    ** transaction when do_balance needs to change it.
    ** This way do_balance won't have to schedule when trying to prepare
    ** the super for logging
    */
    reiserfs_prepare_for_journal(p_s_tb->tb_sb,
                                 SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ;
    journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
                       SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ;
    if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
	return REPEAT_SEARCH;

    /* if it possible in indirect_to_direct conversion */
    if (buffer_locked (p_s_tbS0)) {
        __wait_on_buffer (p_s_tbS0);
        if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
            return REPEAT_SEARCH;
    }

#ifdef CONFIG_REISERFS_CHECK
    if ( cur_tb ) {
	print_cur_tb ("fix_nodes");
	reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes:  there is pending do_balance");
    }

    if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) {
	reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
			"at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode);
    }

    /* Check parameters. */
    switch (n_op_mode) {
    case M_INSERT:
	/* insert position may be one past the last item */
	if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) )
	    reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
			   n_item_num, B_NR_ITEMS(p_s_tbS0));
	break;
    case M_PASTE:
    case M_DELETE:
    case M_CUT:
	/* these modes operate on an existing item */
	if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) {
	    print_block (p_s_tbS0, 0, -1, -1);
	    printk("mode = %c insert_size = %d\n", n_op_mode, p_s_tb->insert_size[0]);
	    reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d)", n_item_num);
	}
	break;
    default:
	reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation");
    }
#endif

    if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH)
	// FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
	return REPEAT_SEARCH;


    /* Starting from the leaf level; for all levels n_h of the tree. */
    for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) {
	if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) {
	    goto repeat;
	}

	if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num,
					   n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) {
	    if ( n_ret_value == NO_BALANCING_NEEDED ) {
		/* No balancing for higher levels needed. */
		if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
		    goto repeat;
		}
		if ( n_h != MAX_HEIGHT - 1 )
		    p_s_tb->insert_size[n_h + 1] = 0;
		/* ok, analysis and resource gathering are complete */
		break;
	    }
	    goto repeat;
	}

	if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
	    goto repeat;
	}

	if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) {
	    goto repeat;        /* No disk space, or schedule occurred and
				   analysis may be invalid and needs to be redone. */
	}

	if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) {
	    /* We have a positive insert size but no nodes exist on this
	       level, this means that we are creating a new root. */

	    RFALSE( p_s_tb->blknum[n_h] != 1,
		    "PAP-8350: creating new empty root");

	    if ( n_h < MAX_HEIGHT - 1 )
		p_s_tb->insert_size[n_h + 1] = 0;
	}
	else
	    if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) {
		if ( p_s_tb->blknum[n_h] > 1 ) {
		    /* The tree needs to be grown, so this node S[n_h]
		       which is the root node is split into two nodes,
		       and a new node (S[n_h+1]) will be created to
		       become the root node.  */

		    RFALSE( n_h == MAX_HEIGHT - 1,
			    "PAP-8355: attempt to create too high of a tree");

		    p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE;
		}
		else
		    if ( n_h < MAX_HEIGHT - 1 )
			p_s_tb->insert_size[n_h + 1] = 0;
	    }
	    else
		/* each new node on this level costs one delim. key + one
		   disk child pointer in the parent */
		p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
    }


    /* All resources collected; now make sure every buffer involved is
       journal-prepared and unlocked before balancing starts. */
    windex = push_journal_writer("fix_nodes") ;
    if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
	pop_journal_writer(windex) ;
	if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
	    wait_tb_buffers_run = 1 ;
	    n_ret_value = REPEAT_SEARCH ;
	    goto repeat;
	} else {
	    return CARRY_ON;
	}
    } else {
	wait_tb_buffers_run = 1 ;
	pop_journal_writer(windex) ;
	goto repeat;
    }

 repeat:
    // fix_nodes was unable to perform its calculation due to
    // filesystem got changed under us, lack of free disk space or i/o
    // failure. If the first is the case - the search will be
    // repeated. For now - free all resources acquired so far except
    // for the new allocated nodes
    {
	int i;

	/* Release path buffers. */
	if (wait_tb_buffers_run) {
	    pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ;
	} else {
	    pathrelse (p_s_tb->tb_path);
        }
	/* brelse all resources collected for balancing */
	for ( i = 0; i < MAX_HEIGHT; i++ ) {
	    if (wait_tb_buffers_run) {
		/* undo the journal preparation done in
		   wait_tb_buffers_until_unlocked */
		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]);
		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]);
		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]);
		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]);
		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]);
		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]);
	    }

	    brelse (p_s_tb->L[i]);p_s_tb->L[i] = 0;
	    brelse (p_s_tb->R[i]);p_s_tb->R[i] = 0;
	    brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = 0;
	    brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = 0;
	    brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = 0;
	    brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = 0;
	}

	if (wait_tb_buffers_run) {
	    for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
		if ( p_s_tb->FEB[i] ) {
		    reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
						     p_s_tb->FEB[i]) ;
		}
	    }
	}
	return n_ret_value;
    }

}
2486 
2487 
2488 /* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2489    wanted to make lines shorter */
unfix_nodes(struct tree_balance * tb)2490 void unfix_nodes (struct tree_balance * tb)
2491 {
2492     int	i;
2493 
2494     /* Release path buffers. */
2495     pathrelse_and_restore (tb->tb_sb, tb->tb_path);
2496 
2497     /* brelse all resources collected for balancing */
2498     for ( i = 0; i < MAX_HEIGHT; i++ ) {
2499 	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]);
2500 	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]);
2501 	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]);
2502 	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]);
2503 	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]);
2504 	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]);
2505 
2506 	brelse (tb->L[i]);
2507 	brelse (tb->R[i]);
2508 	brelse (tb->FL[i]);
2509 	brelse (tb->FR[i]);
2510 	brelse (tb->CFL[i]);
2511 	brelse (tb->CFR[i]);
2512     }
2513 
2514     /* deal with list of allocated (used and unused) nodes */
2515     for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2516 	if ( tb->FEB[i] ) {
2517 	    unsigned long blocknr  = tb->FEB[i]->b_blocknr ;
2518 	    /* de-allocated block which was not used by balancing and
2519                bforget about buffer for it */
2520 	    brelse (tb->FEB[i]);
2521 	    reiserfs_free_block (tb->transaction_handle, blocknr);
2522 	}
2523 	if (tb->used[i]) {
2524 	    /* release used as new nodes including a new root */
2525 	    brelse (tb->used[i]);
2526 	}
2527     }
2528 
2529     if (tb->vn_buf)
2530     reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2531 
2532 }
2533 
2534 
2535 
2536