Diffstat (limited to 'drivers/hv/hv_balloon.c')
-rw-r--r--	drivers/hv/hv_balloon.c | 484
1 file changed, 430 insertions(+), 54 deletions(-)
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 37873213e24..d5225261ee5 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -412,13 +412,45 @@ struct dm_info_msg {
  * End protocol definitions.
  */
 
-static bool hot_add;
+/*
+ * State to manage hot adding memory into the guest.
+ * The range start_pfn : end_pfn specifies the range
+ * that the host has asked us to hot add. The range
+ * start_pfn : ha_end_pfn specifies the range that we have
+ * currently hot added. We hot add in multiples of 128M
+ * chunks; it is possible that we may not be able to bring
+ * online all the pages in the region. The range
+ * covered_start_pfn : covered_end_pfn defines the pages that can
+ * be brought online.
+ */
+
+struct hv_hotadd_state {
+	struct list_head list;
+	unsigned long start_pfn;
+	unsigned long covered_start_pfn;
+	unsigned long covered_end_pfn;
+	unsigned long ha_end_pfn;
+	unsigned long end_pfn;
+};
+
+struct balloon_state {
+	__u32 num_pages;
+	struct work_struct wrk;
+};
+
+struct hot_add_wrk {
+	union dm_mem_page_range ha_page_range;
+	union dm_mem_page_range ha_region_range;
+	struct work_struct wrk;
+};
+
+static bool hot_add = true;
 static bool do_hot_add;
 /*
  * Delay reporting memory pressure by
  * the specified number of seconds.
  */
-static uint pressure_report_delay = 30;
+static uint pressure_report_delay = 45;
 
 module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
 MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
@@ -446,6 +478,7 @@ enum hv_dm_state {
 static __u8 recv_buffer[PAGE_SIZE];
 static __u8 *send_buffer;
 #define PAGES_IN_2M	512
+#define HA_CHUNK (32 * 1024)
 
 struct hv_dynmem_device {
 	struct hv_device *dev;
@@ -459,7 +492,28 @@ struct hv_dynmem_device {
 	unsigned int num_pages_ballooned;
 
 	/*
-	 * This thread handles both balloon/hot-add
+	 * State to manage the ballooning (up) operation.
+	 */
+	struct balloon_state balloon_wrk;
+
+	/*
+	 * State to execute the "hot-add" operation.
+	 */
+	struct hot_add_wrk ha_wrk;
+
+	/*
+	 * This state tracks if the host has specified a hot-add
+	 * region.
+	 */
+	bool host_specified_ha_region;
+
+	/*
+	 * State to synchronize hot-add.
+	 */
+	struct completion  ol_waitevent;
+	bool ha_waiting;
+	/*
+	 * This thread handles hot-add
 	 * requests from the host as well as notifying
 	 * the host with regards to memory pressure in
 	 * the guest.
@@ -467,6 +521,11 @@ struct hv_dynmem_device {
 	struct task_struct *thread;
 
 	/*
+	 * A list of hot-add regions.
+	 */
+	struct list_head ha_region_list;
+
+	/*
 	 * We start with the highest version we can support
 	 * and downgrade based on the host; we save here the
 	 * next version to try.
@@ -476,35 +535,329 @@ struct hv_dynmem_device {
 
 static struct hv_dynmem_device dm_device;
 
-static void hot_add_req(struct hv_dynmem_device *dm, struct dm_hot_add *msg)
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size)
 {
+	int i;
 
-	struct dm_hot_add_response resp;
+	for (i = 0; i < size; i++) {
+		struct page *pg;
+		pg = pfn_to_page(start_pfn + i);
+		__online_page_set_limits(pg);
+		__online_page_increment_counters(pg);
+		__online_page_free(pg);
+	}
+}
+
+static void hv_mem_hot_add(unsigned long start, unsigned long size,
+				unsigned long pfn_count,
+				struct hv_hotadd_state *has)
+{
+	int ret = 0;
+	int i, nid, t;
+	unsigned long start_pfn;
+	unsigned long processed_pfn;
+	unsigned long total_pfn = pfn_count;
+
+	for (i = 0; i < (size/HA_CHUNK); i++) {
+		start_pfn = start + (i * HA_CHUNK);
+		has->ha_end_pfn +=  HA_CHUNK;
+
+		if (total_pfn > HA_CHUNK) {
+			processed_pfn = HA_CHUNK;
+			total_pfn -= HA_CHUNK;
+		} else {
+			processed_pfn = total_pfn;
+			total_pfn = 0;
+		}
+
+		has->covered_end_pfn +=  processed_pfn;
+
+		init_completion(&dm_device.ol_waitevent);
+		dm_device.ha_waiting = true;
+
+		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
+		ret = add_memory(nid, PFN_PHYS((start_pfn)),
+				(HA_CHUNK << PAGE_SHIFT));
+
+		if (ret) {
+			pr_info("hot_add memory failed error is %d\n", ret);
+			has->ha_end_pfn -= HA_CHUNK;
+			has->covered_end_pfn -=  processed_pfn;
+			break;
+		}
+
+		/*
+		 * Wait for the memory block to be onlined.
+		 */
+		t = wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ);
+		if (t == 0) {
+			pr_info("hot_add memory timed out\n");
+			has->ha_end_pfn -= HA_CHUNK;
+			has->covered_end_pfn -=  processed_pfn;
+			break;
+		}
+
+	}
+
+	return;
+}
+
+static void hv_online_page(struct page *pg)
+{
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long cur_start_pgp;
+	unsigned long cur_end_pgp;
+
+	if (dm_device.ha_waiting) {
+		dm_device.ha_waiting = false;
+		complete(&dm_device.ol_waitevent);
+	}
+
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+		cur_start_pgp = (unsigned long)
+				pfn_to_page(has->covered_start_pfn);
+		cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
+
+		if (((unsigned long)pg >= cur_start_pgp) &&
+			((unsigned long)pg < cur_end_pgp)) {
+			/*
+			 * This frame is currently backed; online the
+			 * page.
+			 */
+			__online_page_set_limits(pg);
+			__online_page_increment_counters(pg);
+			__online_page_free(pg);
+			has->covered_start_pfn++;
+		}
+	}
+}
+
+static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
+{
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long residual, new_inc;
+
+	if (list_empty(&dm_device.ha_region_list))
+		return false;
+
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+
+		/*
+		 * If the pfn range we are dealing with is not in the current
+		 * "hot add block", move on.
+		 */
+		if ((start_pfn >= has->end_pfn))
+			continue;
+		/*
+		 * If the current hot add-request extends beyond
+		 * our current limit; extend it.
+		 */
+		if ((start_pfn + pfn_cnt) > has->end_pfn) {
+			residual = (start_pfn + pfn_cnt - has->end_pfn);
+			/*
+			 * Extend the region by multiples of HA_CHUNK.
+			 */
+			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
+			if (residual % HA_CHUNK)
+				new_inc += HA_CHUNK;
+
+			has->end_pfn += new_inc;
+		}
+
+		/*
+		 * If the current start pfn is not where the covered_end
+		 * is, update it.
+		 */
+
+		if (has->covered_end_pfn != start_pfn) {
+			has->covered_end_pfn = start_pfn;
+			has->covered_start_pfn = start_pfn;
+		}
+		return true;
+
+	}
 
-	if (do_hot_add) {
+	return false;
+}
+
+static unsigned long handle_pg_range(unsigned long pg_start,
+					unsigned long pg_count)
+{
+	unsigned long start_pfn = pg_start;
+	unsigned long pfn_cnt = pg_count;
+	unsigned long size;
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long pgs_ol = 0;
+	unsigned long old_covered_state;
+
+	if (list_empty(&dm_device.ha_region_list))
+		return 0;
 
-		pr_info("Memory hot add not supported\n");
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
 		/*
-		 * Currently we do not support hot add.
-		 * Just fail the request.
+		 * If the pfn range we are dealing with is not in the current
+		 * "hot add block", move on.
 		 */
+		if ((start_pfn >= has->end_pfn))
+			continue;
+
+		old_covered_state = has->covered_end_pfn;
+
+		if (start_pfn < has->ha_end_pfn) {
+			/*
+			 * This is the case where we are backing pages
+			 * in an already hot added region. Bring
+			 * these pages online first.
+			 */
+			pgs_ol = has->ha_end_pfn - start_pfn;
+			if (pgs_ol > pfn_cnt)
+				pgs_ol = pfn_cnt;
+			hv_bring_pgs_online(start_pfn, pgs_ol);
+			has->covered_end_pfn +=  pgs_ol;
+			has->covered_start_pfn +=  pgs_ol;
+			pfn_cnt -= pgs_ol;
+		}
+
+		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
+			/*
+			 * We have some residual hot add range
+			 * that needs to be hot added; hot add
+			 * it now. Hot add a multiple of
+			 * HA_CHUNK that fully covers the pages
+			 * we have.
+			 */
+			size = (has->end_pfn - has->ha_end_pfn);
+			if (pfn_cnt <= size) {
+				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
+				if (pfn_cnt % HA_CHUNK)
+					size += HA_CHUNK;
+			} else {
+				pfn_cnt = size;
+			}
+			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
+		}
+		/*
+		 * If we managed to online any pages that were given to us,
+		 * we declare success.
+		 */
+		return has->covered_end_pfn - old_covered_state;
+
 	}
+	return 0;
+}
+
+static unsigned long process_hot_add(unsigned long pg_start,
+					unsigned long pfn_cnt,
+					unsigned long rg_start,
+					unsigned long rg_size)
+{
+	struct hv_hotadd_state *ha_region = NULL;
+
+	if (pfn_cnt == 0)
+		return 0;
+
+	if (!dm_device.host_specified_ha_region)
+		if (pfn_covered(pg_start, pfn_cnt))
+			goto do_pg_range;
+
+	/*
+	 * If the host has specified a hot-add range; deal with it first.
+	 */
+
+	if ((rg_size != 0) && (!dm_device.host_specified_ha_region)) {
+		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
+		if (!ha_region)
+			return 0;
+
+		INIT_LIST_HEAD(&ha_region->list);
+
+		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
+		ha_region->start_pfn = rg_start;
+		ha_region->ha_end_pfn = rg_start;
+		ha_region->covered_start_pfn = pg_start;
+		ha_region->covered_end_pfn = pg_start;
+		ha_region->end_pfn = rg_start + rg_size;
+	}
+
+do_pg_range:
+	/*
+	 * Process the page range specified; bringing them
+	 * online if possible.
+	 */
+	return handle_pg_range(pg_start, pfn_cnt);
+}
+
+#endif
+
+static void hot_add_req(struct work_struct *dummy)
+{
+	struct dm_hot_add_response resp;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	unsigned long pg_start, pfn_cnt;
+	unsigned long rg_start, rg_sz;
+#endif
+	struct hv_dynmem_device *dm = &dm_device;
+
 	memset(&resp, 0, sizeof(struct dm_hot_add_response));
 	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
 	resp.hdr.size = sizeof(struct dm_hot_add_response);
 	resp.hdr.trans_id = atomic_inc_return(&trans_id);
-	resp.page_count = 0;
-	resp.result = 0;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
+	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
+
+	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
+	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
+
+	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
+		unsigned long region_size;
+		unsigned long region_start;
+
+		/*
+		 * The host has not specified the hot-add region.
+		 * Based on the hot-add page range being specified,
+		 * compute a hot-add region that can cover the pages
+		 * that need to be hot-added while ensuring the alignment
+		 * and size requirements of Linux as it relates to hot-add.
+		 */
+		region_start = pg_start;
+		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
+		if (pfn_cnt % HA_CHUNK)
+			region_size += HA_CHUNK;
+
+		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
+
+		rg_start = region_start;
+		rg_sz = region_size;
+	}
+
+	resp.page_count = process_hot_add(pg_start, pfn_cnt,
+					rg_start, rg_sz);
+#endif
+	if (resp.page_count > 0)
+		resp.result = 1;
+	else
+		resp.result = 0;
+
+	if (!do_hot_add || (resp.page_count == 0))
+		pr_info("Memory hot add failed\n");
 
 	dm->state = DM_INITIALIZED;
 	vmbus_sendpacket(dm->dev->channel, &resp,
 			sizeof(struct dm_hot_add_response),
 			(unsigned long)NULL,
 			VM_PKT_DATA_INBAND, 0);
-
 }
 
 static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
@@ -523,7 +876,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
 	}
 }
 
-unsigned long compute_balloon_floor(void)
+static unsigned long compute_balloon_floor(void)
 {
 	unsigned long min_pages;
 #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
@@ -657,9 +1010,9 @@ static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
 
-static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req)
+static void balloon_up(struct work_struct *dummy)
 {
-	int num_pages = req->num_pages;
+	int num_pages = dm_device.balloon_wrk.num_pages;
 	int num_ballooned = 0;
 	struct dm_balloon_response *bl_resp;
 	int alloc_unit;
@@ -684,14 +1037,14 @@ static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req)
 
 		num_pages -= num_ballooned;
 
-		num_ballooned = alloc_balloon_pages(dm, num_pages,
+		num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
 						bl_resp, alloc_unit,
 						 &alloc_error);
 
 		if ((alloc_error) || (num_ballooned == num_pages)) {
 			bl_resp->more_pages = 0;
 			done = true;
-			dm->state = DM_INITIALIZED;
+			dm_device.state = DM_INITIALIZED;
 		}
 
 		/*
@@ -719,7 +1072,7 @@ static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req)
 			pr_info("Balloon response failed\n");
 
 			for (i = 0; i < bl_resp->range_count; i++)
-				free_balloon_pages(dm,
+				free_balloon_pages(&dm_device,
 						 &bl_resp->range_array[i]);
 
 			done = true;
@@ -761,7 +1114,6 @@ static int dm_thread_func(void *dm_dev)
 {
 	struct hv_dynmem_device *dm = dm_dev;
 	int t;
-	unsigned long scan_start;
 
 	while (!kthread_should_stop()) {
 		t = wait_for_completion_timeout(&dm_device.config_event, 1*HZ);
@@ -773,22 +1125,6 @@ static int dm_thread_func(void *dm_dev)
 
 		if (t == 0)
 			post_status(dm);
-
-		scan_start = jiffies;
-		switch (dm->state) {
-		case DM_BALLOON_UP:
-			balloon_up(dm, (struct dm_balloon *)recv_buffer);
-			break;
-
-		case DM_HOT_ADD:
-			hot_add_req(dm, (struct dm_hot_add *)recv_buffer);
-			break;
-		default:
-			break;
-		}
-
-		if (!time_in_range(jiffies, scan_start, scan_start + HZ))
-			post_status(dm);
-
 	}
 
 	return 0;
@@ -861,6 +1197,10 @@ static void balloon_onchannelcallback(void *context)
 	struct dm_message *dm_msg;
 	struct dm_header *dm_hdr;
 	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
+	struct dm_balloon *bal_msg;
+	struct dm_hot_add *ha_msg;
+	union dm_mem_page_range *ha_pg_range;
+	union dm_mem_page_range *ha_region;
 
 	memset(recv_buffer, 0, sizeof(recv_buffer));
 	vmbus_recvpacket(dev->channel, recv_buffer,
@@ -882,8 +1222,12 @@ static void balloon_onchannelcallback(void *context)
 			break;
 
 		case DM_BALLOON_REQUEST:
+			if (dm->state == DM_BALLOON_UP)
+				pr_warn("Currently ballooning\n");
+			bal_msg = (struct dm_balloon *)recv_buffer;
 			dm->state = DM_BALLOON_UP;
-			complete(&dm->config_event);
+			dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
+			schedule_work(&dm_device.balloon_wrk.wrk);
 			break;
 
 		case DM_UNBALLOON_REQUEST:
@@ -893,8 +1237,31 @@ static void balloon_onchannelcallback(void *context)
 			break;
 
 		case DM_MEM_HOT_ADD_REQUEST:
+			if (dm->state == DM_HOT_ADD)
+				pr_warn("Currently hot-adding\n");
 			dm->state = DM_HOT_ADD;
-			complete(&dm->config_event);
+			ha_msg = (struct dm_hot_add *)recv_buffer;
+			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
+				/*
+				 * This is a normal hot-add request specifying
+				 * hot-add memory.
+				 */
+				ha_pg_range = &ha_msg->range;
+				dm->ha_wrk.ha_page_range = *ha_pg_range;
+				dm->ha_wrk.ha_region_range.page_range = 0;
+			} else {
+				/*
+				 * Host is specifying that we first hot-add
+				 * a region and then partially populate this
+				 * region.
+				 */
+				dm->host_specified_ha_region = true;
+				ha_pg_range = &ha_msg->range;
+				ha_region = &ha_pg_range[1];
+				dm->ha_wrk.ha_page_range = *ha_pg_range;
+				dm->ha_wrk.ha_region_range = *ha_region;
+			}
+			schedule_work(&dm_device.ha_wrk.wrk);
 			break;
 
 		case DM_INFO_MESSAGE:
@@ -937,6 +1304,10 @@ static int balloon_probe(struct hv_device *dev,
 	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
 	init_completion(&dm_device.host_event);
 	init_completion(&dm_device.config_event);
+	INIT_LIST_HEAD(&dm_device.ha_region_list);
+	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
+	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
+	dm_device.host_specified_ha_region = false;
 
 	dm_device.thread =
 		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
@@ -945,6 +1316,10 @@ static int balloon_probe(struct hv_device *dev,
 		goto probe_error1;
 	}
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	set_online_page_callback(&hv_online_page);
+#endif
+
 	hv_set_drvdata(dev, &dm_device);
 	/*
 	 * Initiate the hand shake with the host and negotiate
@@ -962,8 +1337,7 @@ static int balloon_probe(struct hv_device *dev,
 	ret = vmbus_sendpacket(dev->channel, &version_req,
 				sizeof(struct dm_version_request),
 				(unsigned long)NULL,
-				VM_PKT_DATA_INBAND,
-				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+				VM_PKT_DATA_INBAND, 0);
 	if (ret)
 		goto probe_error2;
 
@@ -990,12 +1364,6 @@ static int balloon_probe(struct hv_device *dev,
 	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
 
 	cap_msg.caps.cap_bits.balloon = 1;
-	/*
-	 * While we currently don't support hot-add,
-	 * we still advertise this capability since the
-	 * host requires that guests partcipating in the
-	 * dynamic memory protocol support hot add.
-	 */
 	cap_msg.caps.cap_bits.hot_add = 1;
 
 	/*
@@ -1009,8 +1377,7 @@ static int balloon_probe(struct hv_device *dev,
 	ret = vmbus_sendpacket(dev->channel, &cap_msg,
 				sizeof(struct dm_capabilities),
 				(unsigned long)NULL,
-				VM_PKT_DATA_INBAND,
-				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+				VM_PKT_DATA_INBAND, 0);
 	if (ret)
 		goto probe_error2;
 
@@ -1034,6 +1401,9 @@ static int balloon_probe(struct hv_device *dev,
 	return 0;
 
 probe_error2:
+#ifdef CONFIG_MEMORY_HOTPLUG
+	restore_online_page_callback(&hv_online_page);
+#endif
 	kthread_stop(dm_device.thread);
 
 probe_error1:
@@ -1046,13 +1416,26 @@ probe_error0:
 static int balloon_remove(struct hv_device *dev)
 {
 	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
+	struct list_head *cur, *tmp;
+	struct hv_hotadd_state *has;
 
 	if (dm->num_pages_ballooned != 0)
 		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
 
+	cancel_work_sync(&dm->balloon_wrk.wrk);
+	cancel_work_sync(&dm->ha_wrk.wrk);
+
 	vmbus_close(dev->channel);
 	kthread_stop(dm->thread);
 	kfree(send_buffer);
+#ifdef CONFIG_MEMORY_HOTPLUG
+	restore_online_page_callback(&hv_online_page);
+#endif
+	list_for_each_safe(cur, tmp, &dm->ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+		list_del(&has->list);
+		kfree(has);
+	}
 
 	return 0;
 }
@@ -1079,14 +1462,7 @@ static int __init init_balloon_drv(void)
 	return vmbus_driver_register(&balloon_drv);
 }
 
-static void exit_balloon_drv(void)
-{
-
-	vmbus_driver_unregister(&balloon_drv);
-}
-
 module_init(init_balloon_drv);
-module_exit(exit_balloon_drv);
 
 MODULE_DESCRIPTION("Hyper-V Balloon");
 MODULE_VERSION(HV_DRV_VERSION);