puppy居
puppy居士
posts - 41,comments - 27,trackbacks - 0

Linux块设备层分析(1)

R.wen

一、综述

图1是块设备操作的一个分层实现图。当一个进程调用read读取一个文件时,内核执行如下过程:首先,它通过VFS层去检查要读取的文件块是否已经被cache了,这个cache由buffer_head结构来描述。如果要读取的文件块还没有被cache,则就要从文件系统中去读取了,这就是文件系统的映射层,它通过一个address_space结构来引用,然后调用文件系统读函数(readpage)去读取一个页面大小的数据,这个读函数对于不同的文件系统来说,是不一样的。当它从磁盘中读出数据时,它会将数据页链入cache中,当下次再读取时,就不需要再次从磁盘中去读了。readpage()函数并不是直接去操作磁盘,而只是将请求初始化成一个bio结构,并提交给通用块层(generic block layer)。


1

       这是通过submit_bio()去完成的。通用块层再调用相应设备的IO调度器,通过这个调度器的调度算法,将这个bio或合并到已存在的request中,或创建一个新的request,并将这个新创建的request插入到设备的请求队列中去。这就完成了IO调度层的工作。最后就是块设备驱动所做的工作了。IO调度器传递给块驱动的是一个请求队列,块驱动就是要处理这个队列中的请求,直到这个队列为空为止。

二、通用块层(generic block layer)

通用块层操作的是一个bio结构,这个结构主要的数据域是,

unsigned short              bi_vcnt;

struct bio_vec        *bi_io_vec;     /* the actual vec list */

这个就是要读写的数据向量,且每个struct bio_vec       为一个segment

// Excerpt of submit_bio(): its main job is to call generic_make_request()
// to do the work. The "……" line stands for code the article leaves out.

void submit_bio(int rw, struct bio *bio)

{

       ……

       generic_make_request(bio);

}

// Excerpt of generic_make_request(): its main job is to pass the bio down
// to the driver. It keeps calling the queue's make_request_fn for as long as
// that call returns non-zero. "……" marks code the article leaves out.

void generic_make_request(struct bio *bio)

{

       ……

       do {

              char b[BDEVNAME_SIZE];

              // Get the request queue of the block device (one queue per device).

              q = bdev_get_queue(bio->bi_bdev);

             

              /*

              * If this device has partitions, remap block n

              * of partition p to block n+start(p) of the disk.

              */

              blk_partition_remap(bio); // partition remap, e.g. turn an offset relative to a partition into an absolute offset on the whole block device

             

              old_sector = bio->bi_sector;

              old_dev = bio->bi_bdev->bd_dev;

              ……

              // This is the queue's request handler; it is set when the block device creates its request queue.

              // For IDE-like devices it is __make_request(); for a ramdisk it is something else.

              ret = q->make_request_fn(q, bio); // __make_request()

       } while (ret);

}

// Excerpt of __make_request(): its main job is to ask the I/O scheduler to
// merge the bio into an existing request, or else to insert a new request at
// a suitable place in the queue. "……" marks code the article leaves out
// (including whatever jumps to the end_io label).

static int __make_request(request_queue_t *q, struct bio *bio)

{

       struct request *req;

       int el_ret, nr_sectors, barrier, err;

       const unsigned short prio = bio_prio(bio);

       const int sync = bio_sync(bio);

       int rw_flags;

       nr_sectors = bio_sectors(bio);

       // Used to handle high memory.

       blk_queue_bounce(q, &bio);

      

       spin_lock_irq(q->queue_lock);

       // Test whether the bio can be merged; the scheduling algorithm itself is ignored in this article.

       el_ret = elv_merge(q, &req, bio);

       switch (el_ret) {

              // The first two cases can be merged.

              case ELEVATOR_BACK_MERGE:

                     ……

                     goto out;

              case ELEVATOR_FRONT_MERGE:

                     ……

                     goto out;

             

// Cannot merge: a new request has to be created.

              /* ELV_NO_MERGE: elevator says don't/can't merge. */

              default:

                     ;

       }

get_rq:

      

       rw_flags = bio_data_dir(bio);

       if (sync)

              rw_flags |= REQ_RW_SYNC;

       // Create a new request.
       // NOTE(review): queue_lock is taken above and taken again below, so
       // get_request_wait() presumably returns with the lock dropped - confirm.

       req = get_request_wait(q, rw_flags, bio);

       // Initialise the request from the bio.

       init_request_from_bio(req, bio);

       spin_lock_irq(q->queue_lock);

       if (elv_queue_empty(q)) // empty-queue handling

              blk_plug_device(q);

       add_request(q, req); // add the new request to the queue

out:

       if (sync) // synchronous bio: process the request right away

              __generic_unplug_device(q);

       spin_unlock_irq(q->queue_lock);

       return 0;

end_io:

       bio_endio(bio, nr_sectors << 9, err);

       return 0;

}

/*
 * Trigger the block driver to perform the real I/O.
 *
 * Does nothing when blk_queue_stopped(q) is true or when
 * blk_remove_plug(q) returns 0; otherwise calls the queue's request_fn.
 */
void __generic_unplug_device(request_queue_t *q)
{
       if (unlikely(blk_queue_stopped(q)) || !blk_remove_plug(q))
              return;

       /* the device's request handler; it belongs to the driver layer */
       q->request_fn(q);
}    

posted @ 2008-08-22 15:17 puppy 阅读(1060) | 评论 (0)编辑 收藏

二、文件读过程

我们先看标准的读过程。

1、准备工作。通过VFS层,及一些初始化操作,为真正的读操作做准备。

首先是用户进程通过read系统调用发出一个读请求:

/*
 * read() system call entry.
 *
 * Looks up the struct file for @fd, reads through vfs_read() starting at the
 * file's current position, stores the updated position back and drops the
 * file reference. Returns vfs_read()'s result, or -EBADF when the lookup
 * of @fd fails.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
       int fput_needed;
       struct file *file = fget_light(fd, &fput_needed);
       loff_t pos;
       ssize_t ret;

       if (!file)
              return -EBADF;

       pos = file_pos_read(file);
       ret = vfs_read(file, buf, count, &pos);
       file_pos_write(file, pos);

       fput_light(file, fput_needed);
       return ret;
}

然后通过VFS层操作:

/*
 * Excerpt of vfs_read(): verifies the read area, asks the security layer for
 * MAY_READ permission and then dispatches to the file's ->read method, or to
 * do_sync_read() when ->read is not set. Returns the result of whichever
 * step ran last. "……" marks code the article leaves out.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)

{

       ssize_t ret;

       …… // some checks

       ret = rw_verify_area(READ, file, pos, count);

       if (ret >= 0) {

              /* a non-negative result from rw_verify_area() replaces count */
              count = ret;

              ret = security_file_permission (file, MAY_READ);

              if (!ret) {

                     if (file->f_op->read)

                            ret = file->f_op->read(file, buf, count, pos);

                     else

                            ret = do_sync_read(file, buf, count, pos);

                     ……

              }

       }

       return ret;

}

对于ext2文件系统,有:

const struct file_operations ext2_file_operations = {

       .llseek            = generic_file_llseek,

       .read              = do_sync_read,

       .write             = do_sync_write,

       .aio_read = generic_file_aio_read,

       .aio_write       = generic_file_aio_write,

       .ioctl              = ext2_ioctl,

#ifdef CONFIG_COMPAT

       .compat_ioctl = ext2_compat_ioctl,

#endif

       .mmap           = generic_file_mmap,

       .open             = generic_file_open,

       .release    = ext2_release_file,

       .fsync            = ext2_sync_file,

       .sendfile = generic_file_sendfile,

       .splice_read    = generic_file_splice_read,

       .splice_write   = generic_file_splice_write,

};

所以它执行的是:

/*
 * Synchronous read implemented on top of the file's ->aio_read method.
 *
 * Wraps @buf/@len in a single iovec, sets up a synchronous kiocb positioned
 * at *@ppos and calls ->aio_read, retrying while it returns -EIOCBRETRY.
 * If the request was queued (-EIOCBQUEUED) the result of waiting for the
 * kiocb is returned instead. *@ppos is updated from the kiocb position.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
       /* set up the two data structures: the iovec and the kiocb */
       struct iovec vec = { .iov_base = buf, .iov_len = len };
       struct kiocb req;
       ssize_t nread;

       init_sync_kiocb(&req, filp);
       req.ki_pos = *ppos;
       req.ki_left = len;

       while ((nread = filp->f_op->aio_read(&req, &vec, 1, req.ki_pos)) == -EIOCBRETRY)
              wait_on_retry_sync_kiocb(&req);

       if (nread == -EIOCBQUEUED)
              nread = wait_on_sync_kiocb(&req);

       *ppos = req.ki_pos;
       return nread;
}

可以看到,它最后还是调用了aio_read()接口函数来完成读操作,即在2.6中,aio_read()为同步和异步读操作的通用接口。由上可以看到,对于ext2,它是generic_file_aio_read():

/**

* generic_file_aio_read - generic filesystem read routine

* @iocb:       kernel I/O control block

* @iov: io vector request

* @nr_segs: number of segments in the iovec

* @pos: current file position

*

* This is the "read()" routine for all filesystems

* that can use the page cache directly.

*

* Excerpt: "……" marks code the article leaves out (the checks that
* set up count, and the O_DIRECT path). For each non-empty iovec segment
* the function runs do_generic_file_read() with file_read_actor and adds
* the bytes written to the return value; on an error it returns the bytes
* read so far if any, otherwise the error code.

*/

ssize_t

generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,

              unsigned long nr_segs, loff_t pos)

{

       struct file *filp = iocb->ki_filp;

       ssize_t retval;

       unsigned long seg;

       size_t count;

       loff_t *ppos = &iocb->ki_pos;

       …….// some checks

       /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */

       if (filp->f_flags & O_DIRECT) {

              ……// direct I/O, skipped here for now

       }

       retval = 0;

       if (count) {

              for (seg = 0; seg < nr_segs; seg++) {

                     read_descriptor_t desc; // a read descriptor structure

                     desc.written = 0;

                     desc.arg.buf = iov[seg].iov_base;

                     desc.count = iov[seg].iov_len;

                     if (desc.count == 0)

                            continue;

                     desc.error = 0;

                     do_generic_file_read(filp,ppos,&desc,file_read_actor);

                     retval += desc.written;

                     if (desc.error) {

                            /* keep a partial byte count if there is one, else report the error */
                            retval = retval ?: desc.error;

                            break;

                     }

              }

       }

out:

       return retval;

}

static inline void do_generic_file_read(struct file * filp, loff_t *ppos,

                                   read_descriptor_t * desc,

                                   read_actor_t actor)

{

       do_generic_mapping_read(filp->f_mapping,

                            &filp->f_ra,

                            filp,

                            ppos,

                            desc,

                            actor);

}

posted @ 2008-08-22 15:16 puppy 阅读(480) | 评论 (0)编辑 收藏

2、读入操作。完成了上面的准备工作,下一步就是执行读操作的核心函数do_generic_mapping_read()。这是一个比较复杂的函数,里面有大量的goto跳转,但还是比较清晰的。

       它工作过程可以描述如下:

a.       如果所要读取的文件在页面缓存中,则跳转到步骤d

b.       文件还没有被缓冲,所以要从设备中去读取,首先分配一个页面,并将这个页面链入到相应的address_space中去

c.       然后调用address_space中的readpage()函数,去从设备中读出一个页面大小的数据到这个页面缓存中。

d.       检查PageUptodate(page)

e.       调用由参数传入的actor函数指针,在此为file_read_actor(),将数据从页面缓存中拷贝到用户缓冲区。

f.        如果请求读取的数据长度已完成,则函数返回,否则跳转到步骤a重复执行。

先看看file_read_actor()

/*
 * Excerpt of file_read_actor(): copy up to @size bytes, starting at @offset
 * inside @page, to the user buffer described by @desc.
 *
 * @size is clamped to desc->count. Bytes that could not be copied are
 * subtracted from the result and desc->error is set to -EFAULT. On return
 * desc->count, desc->written and desc->arg.buf are advanced by the number
 * of bytes copied, which is also the return value.
 * "……" marks code the article leaves out (it is what uses the
 * success label).
 */
int file_read_actor(read_descriptor_t *desc, struct page *page,

                     unsigned long offset, unsigned long size)

{

       char *kaddr;

       unsigned long left, count = desc->count;

       if (size > count)

              size = count;

……

       /* Do it the slow way */

       kaddr = kmap(page);

       left = __copy_to_user(desc->arg.buf, kaddr + offset, size); // copy the data to user space

       kunmap(page);

       if (left) {

              size -= left;

              desc->error = -EFAULT;

       }

success:

       desc->count = count - size;

       desc->written += size;

       desc->arg.buf += size;

       return size;

}

/**

* This is a generic file read routine, and uses the

* mapping->a_ops->readpage() function for the actual low-level stuff.

*

* Excerpt: "……" marks code the article leaves out (among it the code
* that uses the readpage_error label). Per loop iteration the function
* looks the page up in the page cache, allocates and inserts a new page
* and calls ->readpage() if it is missing, then hands the page to @actor.
* On exit it writes the readahead state back to *@_ra and the new file
* position to *@ppos.

*/

void do_generic_mapping_read(struct address_space *mapping,

                          struct file_ra_state *_ra,

                          struct file *filp,

                          loff_t *ppos,

                          read_descriptor_t *desc,

                          read_actor_t actor)

{

       struct inode *inode = mapping->host;

       unsigned long index;

       unsigned long end_index;

       unsigned long offset;

       unsigned long last_index;

       unsigned long next_index;

       unsigned long prev_index;

       loff_t isize;

       struct page *cached_page;

       int error;

       /* work on a local copy of the readahead state; copied back at out: */
       struct file_ra_state ra = *_ra;

       cached_page = NULL;

       /* page index and in-page offset of the current file position */
       index = *ppos >> PAGE_CACHE_SHIFT;

       next_index = index;

       prev_index = ra.prev_page;

       /* index just past the last page touched by the request (rounded up) */
       last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;

       offset = *ppos & ~PAGE_CACHE_MASK;

       isize = i_size_read(inode);

       if (!isize)

              goto out;

       /* index of the page holding the last byte of the file */
       end_index = (isize - 1) >> PAGE_CACHE_SHIFT;

       for (;;) {

              struct page *page;

              unsigned long nr, ret;

              /* nr is the maximum number of bytes to copy from this page */

              nr = PAGE_CACHE_SIZE;

              if (index >= end_index) {

                     if (index > end_index)

                            goto out;

                     /* last page of the file: only the bytes up to isize are valid */
                     nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;

                     if (nr <= offset) {

                            goto out;

                     }

              }

              nr = nr - offset;

              cond_resched();

              if (index == next_index)

                     next_index = page_cache_readahead(mapping, &ra, filp,

                                   index, last_index - index);

find_page:

              page = find_get_page(mapping, index); // look the page up in the cache

              if (unlikely(page == NULL)) {

                     handle_ra_miss(mapping, &ra, index);

                     goto no_cached_page; // not found

              }

              if (!PageUptodate(page)) // check the Uptodate flag

                     goto page_not_up_to_date;

page_ok: // found the cached page

              ret = actor(desc, page, offset, nr); // copy data to the user buffer

              // update the position variables

              offset += ret;

              index += offset >> PAGE_CACHE_SHIFT;

              offset &= ~PAGE_CACHE_MASK;

              page_cache_release(page);

              if (ret == nr && desc->count)

                     continue; // not finished, go round the loop again

              goto out; // done

page_not_up_to_date:

              /* Get exclusive access to the page ... */

              lock_page(page);

              /* Did it get truncated before we got the lock? */

              if (!page->mapping) {

                     unlock_page(page);

                     page_cache_release(page);

                     continue;

              }

              /* Did somebody else fill it already? */

              if (PageUptodate(page)) {

                     unlock_page(page);

                     goto page_ok;

              }

readpage: // the read itself

              /* Start the actual read. The read will unlock the page. */

              error = mapping->a_ops->readpage(filp, page); // the real read operation

              ……             

              /* nr is the maximum number of bytes to copy from this page */

              nr = PAGE_CACHE_SIZE;

              if (index == end_index) {

                     nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;

                     if (nr <= offset) {

                            page_cache_release(page);

                            goto out;

                     }

              }

              nr = nr - offset;

              goto page_ok;

readpage_error:

              /* UHHUH! A synchronous read error occurred. Report it */

              desc->error = error;

              page_cache_release(page);

              goto out;

no_cached_page: // allocate a new page and link it into the cache tree

              /*

              * Ok, it wasn't cached, so we need to create a new

              * page..

              */

              if (!cached_page) {

                     cached_page = page_cache_alloc_cold(mapping);

                     if (!cached_page) {

                            desc->error = -ENOMEM;

                            goto out;

                     }

              }

              /* NOTE(review): error is not checked in this excerpt before the jump to readpage */
              error = add_to_page_cache_lru(cached_page, mapping,

                                          index, GFP_KERNEL);

              page = cached_page;

              cached_page = NULL;

              goto readpage;

       }

out:

       *_ra = ra;

       *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;

       if (cached_page)

              page_cache_release(cached_page);

       if (filp)

              file_accessed(filp);

}

3、从设备读取

对于不同的文件系统有不同的address_space,而且有不同的address_space_operations,对于ext2文件系统来说,这个是如下一个结构:

const struct address_space_operations ext2_aops = {

       .readpage               = ext2_readpage,

       .readpages             = ext2_readpages,

       .writepage             = ext2_writepage,

       .sync_page            = block_sync_page,

       .prepare_write        = ext2_prepare_write,

       .commit_write              = generic_commit_write,

       .bmap                   = ext2_bmap,

       .direct_IO              = ext2_direct_IO,

       .writepages            = ext2_writepages,

       .migratepage          = buffer_migrate_page,

};

可见,这个readpage()便是ext2_readpage()它负责从设备中读取一个页面。

/*
 * ->readpage for ext2: delegate to mpage_readpage(), passing
 * ext2_get_block as the get_block callback. @file is not used.
 */
static int ext2_readpage(struct file *file, struct page *page)
{
       return mpage_readpage(page, ext2_get_block);
}

/*
 * This isn't called much at all
 *
 * Read a single page: let do_mpage_readpage() build a bio for @page using
 * @get_block, and submit that bio as a READ if one was returned.
 * Always returns 0.
 */
int mpage_readpage(struct page *page, get_block_t get_block)
{
       struct bio *bio = NULL;
       sector_t last_block_in_bio = 0;
       struct buffer_head map_bh;
       unsigned long first_logical_block = 0;

       /*
        * map_bh lives on the stack. Clearing only the "mapped" bit would
        * leave every other state bit and b_size indeterminate, so start
        * from a fully defined, unmapped, zero-length buffer_head state.
        */
       map_bh.b_state = 0;
       map_bh.b_size = 0;

       bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
                     &map_bh, &first_logical_block, get_block);
       if (bio)
              mpage_bio_submit(READ, bio);

       return 0;
}

这个函数最终将读请求转成submit_bio(),之后就是通用块层的事情了。

posted @ 2008-08-22 15:16 puppy 阅读(268) | 评论 (0)编辑 收藏

VFS中进程(process)、file、dentry、inode之间的关系

R.wen

       下图(LDD3fig12-2)清晰地描述了这几者之间的关系。

       进程每打开一个文件,就会有一个file结构与之对应。同一个进程可以多次打开同一个文件而得到多个不同的file结构,file结构描述了被打开文件的属性,读写的偏移指针等等当前信息。

       两个不同的file结构可以对应同一个dentry结构。进程多次打开同一个文件时,对应的只有一个dentry结构。Dentry结构存储目录项和对应文件(inode)的信息。

       在存储介质中,每个文件对应唯一的inode结点,但是,每个文件又可以有多个文件名。即可以通过不同的文件名访问同一个文件。这里多个文件名对应一个文件的关系在数据结构中表示就是dentry与inode的关系。

       Inode中不存储文件的名字,它只存储节点号;而dentry则保存有名字和与其对应的节点号,所以就可以通过不同的dentry访问同一个inode

       不同的dentry则是通过文件链接(ln命令)来实现的。

R.wen

 

       下图(LDD3fig12-2)清晰地描述了这几者之间的关系。

       进程每打开一个文件,就会有一个file结构与之对应。同一个进程可以多次打开同一个文件而得到多个不同的file结构,file结构描述了被打开文件的属性,读写的偏移指针等等当前信息。

       两个不同的file结构可以对应同一个dentry结构。进程多次打开同一个文件时,对应的只有一个dentry结构。Dentry结构存储目录项和对应文件(inode)的信息。

       在存储介质中,每个文件对应唯一的inode结点,但是,每个文件又可以有多个文件名。即可以通过不同的文件名访问同一个文件。这里多个文件名对应一个文件的关系在数据结构中表示就是dentry与inode的关系。

       Inode中不存储文件的名字,它只存储节点号;而dentry则保存有名字和与其对应的节点号,所以就可以通过不同的dentry访问同一个inode

       不同的dentry则是通过文件链接(ln命令)来实现的。

posted @ 2008-08-22 15:14 puppy 阅读(658) | 评论 (0)编辑 收藏

Linux文件读写(1)--页面缓冲(Page Cache)的管理

R.wen

一、本文分析文件的读写过程。当用户进程发出一个read()系统调用时,它首先通过VFS到disk cache中去查找相应的文件块有没有已经被缓存起来,如果有,则不需要再次从设备中去读,直接从cache中拷贝给用户缓冲区就可以了;否则它就要先分配一个缓冲页面,并且将其加入到对应的inode节点的address_space中,再调用address_space的readpage()函数,通过submit_bio()向设备发送一个请求,将所需的文件块从设备中读取出来存放在先前分配的缓冲页面中,最后再从该页面中将所需数据拷贝到用户缓冲区。

 

1

二、页面缓冲(Page Cache)的管理

页面缓冲的核心数据结构是struct address_space

/* Forward declaration: address_space only holds a pointer to it. */
struct backing_dev_info;

/*
 * The core data structure of the page cache. It ties an owner (host) to
 * the radix tree of its cached pages (page_tree) and to the operations
 * used to read and write them (a_ops).
 */
struct address_space {

       struct inode           *host;            /* owner: inode, block_device */

       struct radix_tree_root    page_tree;       /* radix tree of all pages */

       rwlock_t        tree_lock;       /* and rwlock protecting it */

       unsigned int           i_mmap_writable;/* count VM_SHARED mappings */

       struct prio_tree_root      i_mmap;         /* tree of private and shared mappings */

       struct list_head       i_mmap_nonlinear;/*list VM_NONLINEAR mappings */

       spinlock_t              i_mmap_lock; /* protect tree, count, list */

       unsigned int           truncate_count;      /* Cover race condition with truncate */

       unsigned long         nrpages; /* number of total pages */

       pgoff_t                  writeback_index;/* writeback starts here */

       const struct address_space_operations *a_ops;   /* methods */

       unsigned long         flags;             /* error bits/gfp mask */

       struct backing_dev_info *backing_dev_info; /* device readahead, etc */

       spinlock_t              private_lock;   /* for use by the address_space */

       struct list_head       private_list;     /* ditto */

       struct address_space     *assoc_mapping;    /* ditto */

} __attribute__((aligned(sizeof(long))));

如下图2,缓冲页面的是通过一个基数树(Radix Tree)来管理的,这是一个简单但非常高效的树结构。

 

2

由图2可以看到,当RADIX_TREE_MAP_SHIFT为6(即每个节点有2^6=64个slot)且树高是1时,它可以寻址大小为64个页面(256KB)的文件;同样,当树高为2时,它可以寻址64*64个页面(16MB)大小的文件。如此下去,在32位的系统中,树高为6级(最高级只有2位:32-6*5),所以它可以寻址2^32-1个页面大小的文件,约为16TB大小,所以目前来说已经足够了。

基数树的遍历也很简单,且类似于虚拟线性地址的转换过程。只要给定树根及文件偏移,就可以找到相应的缓存页面。再如图2右,如果在文件中的偏移为131个页面,这个偏移值的高6位就是第一级偏移,而低6位就是在第二级的偏移,依此类推。如对于偏移值131(二进制10000011),高6位值是131>>6 = 2,所以它在第一级的偏移是2;而在第2级的偏移就是低6位,值为3,即偏移为3,所以得到的结果如图2右方所示。

/* log2 of the number of slots in one radix-tree node: 4 with CONFIG_BASE_SMALL, otherwise 6 */
#define RADIX_TREE_MAP_SHIFT   (CONFIG_BASE_SMALL ? 4 : 6)

/* number of slots in one node: 16 or 64 */
#define RADIX_TREE_MAP_SIZE      (1UL << RADIX_TREE_MAP_SHIFT)

/* number of tag bitmaps kept per node */
#define RADIX_TREE_MAX_TAGS 2

/*
 * Number of unsigned longs needed to hold one tag bit per slot, rounded up.
 * With 64 slots this is 2 when BITS_PER_LONG is 32 and 1 when it is 64.
 * (Nothing may follow the line-continuation backslash on the line below.)
 */
#define RADIX_TREE_TAG_LONGS \
       ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)

/*
 * One node of the radix tree: RADIX_TREE_MAP_SIZE child slots plus one tag
 * bitmap per tag (RADIX_TREE_MAX_TAGS bitmaps of RADIX_TREE_TAG_LONGS longs).
 *
 * NOTE(review): this definition is repeated verbatim further down, and the
 * text after the listings calls the first structure the tree root. This
 * slot was presumably meant to show struct radix_tree_root - confirm
 * against the kernel headers.
 */
struct radix_tree_node {

       unsigned int    height;            /* Height from the bottom */

       unsigned int    count;

       struct rcu_head      rcu_head;

       void        *slots[RADIX_TREE_MAP_SIZE];

       unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];

};

/*
 * One step of a path through the tree, used for path lookup: a node and
 * an offset.
 * NOTE(review): offset is presumably the slot index within node - confirm.
 */
struct radix_tree_path {

       struct radix_tree_node *node;

       int offset;

};

/*
 * The tree node structure: RADIX_TREE_MAP_SIZE child slots plus one tag
 * bitmap per tag, used to record tags for the entries below this node.
 */
struct radix_tree_node {

       unsigned int    height;            /* Height from the bottom */

       unsigned int    count;

       struct rcu_head      rcu_head;

       void        *slots[RADIX_TREE_MAP_SIZE];

       unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];

};

以上是相关的几个数据结构,第一个为树根结点结构,第二个用于路径查找,第三个就是树的节点结构。

注意节点结构中的tags域,这是一个典型的用空间换时间的应用。它是一个二维数组,用于记录该节点下面的子节点有没有相应的标志。目前RADIX_TREE_MAX_TAGS为2,表示只记录两个标志,其中tags[0]为PAGE_CACHE_DIRTY,tags[1]为PAGE_CACHE_WRITEBACK。它表示,如果当前节点的tags[0]值为1,那么它的子树节点就存在PAGE_CACHE_DIRTY节点,否则这个子树分枝就不存在这样的节点,就不必再查找这个子树了。比如在查找PG_dirty的页面时,就不需要遍历整个树,而可以跳过那些tags[0]为0值的子树,这样就提高了查找效率。

posted @ 2008-08-22 15:14 puppy 阅读(1419) | 评论 (0)编辑 收藏
s3c2410 电源管理(3)--s3c2410 pm.c
R.wen

再看看(2)中的enter_state():

/**
 *    enter_state - Do common work of entering low-power state.
 *    @state:        pm_state structure for state we're entering.
 *
 *    Only one caller may be in a sleep transition at a time: if pm_sem is
 *    already held, return -EBUSY rather than wait. PM_SUSPEND_DISK is
 *    handed to pm_suspend_disk(). Every other state runs the three phases
 *    prepare, enter and finish; finish is run whenever prepare succeeded.
 *    Returns the error code of the last phase that produced one.
 */
static int enter_state(suspend_state_t state)
{
    int error;

    /* take the lock without sleeping */
    if (down_trylock(&pm_sem))
        return -EBUSY;

    if (state == PM_SUSPEND_DISK) {
        /* suspend-to-disk request: not the case this article follows */
        error = pm_suspend_disk();
    } else {
        /* prepare phase */
        pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
        error = suspend_prepare(state);

        if (!error) {
            /* enter phase */
            pr_debug("PM: Entering %s sleep\n", pm_states[state]);
            error = suspend_enter(state);

            /* suspend is over: restore the previous state */
            pr_debug("PM: Finishing wakeup.\n");
            suspend_finish(state);
        }
    }

    up(&pm_sem);
    return error;
}


可以看到, 状态的转换分三个阶段, 分别为prepare, enter, finish.
我们已经在(2)中说明, 这三个阶段通过体系无关的函数,
最终会调用与体系结构相关的函数.
他们分别是:
pm_ops->prepare(state)
pm_ops->enter(state)
pm_ops->finish(state)


这个pm_ops就是在体系结构初始化的时候注册进来的,

接着看arch/arm/mach-s3c2410/pm.c

/*
 * Power-management hooks for the s3c2410.
 * Set to PM_DISK_FIRMWARE so we can quickly veto suspend-to-disk.
 */
static struct pm_ops s3c2410_pm_ops = {
    .pm_disk_mode = PM_DISK_FIRMWARE,
    .prepare      = s3c2410_pm_prepare,
    .enter        = s3c2410_pm_enter,
    .finish       = s3c2410_pm_finish,
};

/* s3c2410_pm_init
*
* Attach the power management functions. This should be called
* from the board specific initialisation if the board supports
* it.
*
* Prints a banner and registers s3c2410_pm_ops through pm_set_ops().
* Always returns 0.
*/

int __init s3c2410_pm_init(void)
{
    printk("S3C2410 Power Management, (c) 2004 Simtec Electronics\n");

    pm_set_ops(&s3c2410_pm_ops);
    return 0;
}

这就是实现三个状态转换的三个钩子函数.
/**
*    pm_set_ops - Set the global power method table.
*    @ops:    Pointer to ops structure.
*/
//这个函数较为简单, 只是将/kerenel/power/main.c里的全局变量pm_ops设置成
s3c2410_pm_ops而已了.
//这就完成了这个全局变量的初始化.后续对pm_ops的访问实质上都是访问
s3c2410_pm_ops.
void pm_set_ops(struct pm_ops * ops)
{
    down(&pm_sem);
    pm_ops = ops;
    up(&pm_sem);
}


接着再看他们的实现:

先从最简单的开始,

/*
* Called after processes are frozen, but before we shut down devices.
*
* Nothing to do on the s3c2410: always returns 0.
*/
static int s3c2410_pm_prepare(suspend_state_t state)
{
    return 0;
}

/*
* Called after devices are re-setup, but before processes are thawed.
*
* Nothing to do on the s3c2410: always returns 0.
*/
static int s3c2410_pm_finish(suspend_state_t state)
{
    return 0;
}

如上, 可以看到, prepare和finish在这个体系中都是空操作, 就是说, 对于s3c2410,
无需特殊的工作.

而这个结构的核心就是剩下的s3c2410_pm_enter了. 它真正实现suspend/resume
的状态转换.



/*
 * True when at least one bit that is set in `allow` is clear in `mask`.
 * NOTE(review): presumably a clear mask bit means "interrupt unmasked",
 * so this reads as "some allowed wake-up source is enabled" - confirm.
 */
#define any_allowed(mask, allow) (((mask) & (allow)) != (allow))

/* s3c2410_pm_enter
*
* central control for sleep/resume process
*
* Only PM_SUSPEND_MEM is accepted; any other state, or having no wake-up
* source enabled, returns -EINVAL. Otherwise the function saves the core
* register blocks, programs the wake-up masks, flushes the cache and calls
* s3c2410_cpu_suspend(). The code after that call restores what was saved
* and returns 0.
*/

static int s3c2410_pm_enter(suspend_state_t state)
{
    unsigned long regs_save[16]; // on-stack area used to save 16 general-purpose registers
    unsigned long tmp;

    /* ensure the debug is initialised (if enabled) */

    s3c2410_pm_debug_init();

    DBG("s3c2410_pm_enter(%d)\n", state);

    if (state != PM_SUSPEND_MEM) {
        printk(KERN_ERR PFX "error: only PM_SUSPEND_MEM supported\n");
        return -EINVAL;
    }

    /* check if we have anything to wake-up with... bad things seem
    * to happen if you suspend with no wakeup (system will often
    * require a full power-cycle)
    */
    // check the allowed wake-up interrupts
    if (!any_allowed(s3c_irqwake_intmask, s3c_irqwake_intallow) &&
        !any_allowed(s3c_irqwake_eintmask, s3c_irqwake_eintallow)) {
        printk(KERN_ERR PFX "No sources enabled for wake-up!\n");
        printk(KERN_ERR PFX "Aborting sleep\n");
        return -EINVAL;
    }

    /* prepare check area if configured */
    // some preparation work
    s3c2410_pm_check_prepare();

    /* store the physical address of the register recovery block */
    // physical address of the register save area
    s3c2410_sleep_save_phys = virt_to_phys(regs_save);

    DBG("s3c2410_sleep_save_phys=0x%08lx\n", s3c2410_sleep_save_phys);

    /* ensure at least GESTATUS3 has the resume address */
    // write the physical address of s3c2410_cpu_resume, the function run after wake-up, into S3C2410_GSTATUS3
    __raw_writel(virt_to_phys(s3c2410_cpu_resume), S3C2410_GSTATUS3);

    DBG("GSTATUS3 0x%08x\n", __raw_readl(S3C2410_GSTATUS3));
    DBG("GSTATUS4 0x%08x\n", __raw_readl(S3C2410_GSTATUS4));

    /* save all necessary core registers not covered by the drivers */
    // save the core registers that no driver owns; drivers save their own
    s3c2410_pm_do_save(gpio_save, ARRAY_SIZE(gpio_save));
    s3c2410_pm_do_save(irq_save, ARRAY_SIZE(irq_save));
    s3c2410_pm_do_save(core_save, ARRAY_SIZE(core_save));
    s3c2410_pm_do_save(uart_save, ARRAY_SIZE(uart_save));

    /* set the irq configuration for wake */
    // configure the external interrupts used for wake-up
    s3c2410_pm_configure_extint();

    DBG("sleep: irq wakeup masks: %08lx,%08lx\n",
        s3c_irqwake_intmask, s3c_irqwake_eintmask);


    // enable interrupts?? (the article's author is unsure); writes the wake-up masks to the mask registers
    __raw_writel(s3c_irqwake_intmask, S3C2410_INTMSK);
    __raw_writel(s3c_irqwake_eintmask, S3C2410_EINTMASK);

    /* ack any outstanding external interrupts before we go to sleep */

    __raw_writel(__raw_readl(S3C2410_EINTPEND), S3C2410_EINTPEND);

    /* flush cache back to ram */

    arm920_flush_kern_cache_all();

    s3c2410_pm_check_store();

    /* send the cpu to sleep... */
    // turn off the clocks
    __raw_writel(0x00, S3C2410_CLKCON); /* turn off clocks over sleep */


    // the system goes to sleep; register values are saved in regs_save
    // this function and s3c2410_cpu_resume() above are both written in assembly, in sleep.S
    s3c2410_cpu_suspend(regs_save);


    // when an external interrupt arrives, the system starts to resume
    /* restore the cpu state */

    cpu_init();

    /* unset the return-from-sleep flag, to ensure reset */

    tmp = __raw_readl(S3C2410_GSTATUS2);
    tmp &= S3C2410_GSTATUS2_OFFRESET;
    __raw_writel(tmp, S3C2410_GSTATUS2);

    /* restore the system state */
    // the inverse of the saves above
    s3c2410_pm_do_restore_core(core_save, ARRAY_SIZE(core_save));
    s3c2410_pm_do_restore(gpio_save, ARRAY_SIZE(gpio_save));
    s3c2410_pm_do_restore(irq_save, ARRAY_SIZE(irq_save));
    s3c2410_pm_do_restore(uart_save, ARRAY_SIZE(uart_save));


    // everything below undoes the preparation work done above
    s3c2410_pm_debug_init();

    /* check what irq (if any) restored the system */

    DBG("post sleep: IRQs 0x%08x, 0x%08x\n",
        __raw_readl(S3C2410_SRCPND),
        __raw_readl(S3C2410_EINTPEND));

    s3c2410_pm_show_resume_irqs(IRQ_EINT0, __raw_readl(S3C2410_SRCPND),
                    s3c_irqwake_intmask);

    s3c2410_pm_show_resume_irqs(IRQ_EINT4-4, __raw_readl(S3C2410_EINTPEND),
                    s3c_irqwake_eintmask);

    DBG("post sleep, preparing to return\n");

    s3c2410_pm_check_restore();

    /* ok, let's return from sleep */

    DBG("S3C2410 PM Resume (post-restore)\n");
    return 0;
}



a. 首先是通过

if (!any_allowed(s3c_irqwake_intmask, s3c_irqwake_intallow) &&
        !any_allowed(s3c_irqwake_eintmask, s3c_irqwake_eintallow)) {
            ......
    }
检查唤醒中断, 如果不存在唤醒中断源, 那系统就不允许suspend,
否则就没人将它唤醒了.

/* state for IRQs over sleep */

/* default is to allow for EINT0..EINT15, and IRQ_RTC as wakeup sources
*
* set bit to 1 in allow bitfield to enable the wakeup settings on it
*/
// NOTE(review): the article's own note here said the default is EINT[4-15]
// plus RTC. In the initialisers below intallow also sets its low four bits
// (0xf) next to the RTC bit, and eintallow sets bits 4..15 (0xfff0), which
// matches the EINT0..EINT15 wording of the comment above.
// Both *mask variables start with every bit set.
unsigned long s3c_irqwake_intallow    = 1L << (IRQ_RTC - IRQ_EINT0) | 0xfL;
unsigned long s3c_irqwake_intmask    = 0xffffffffL;
unsigned long s3c_irqwake_eintallow    = 0x0000fff0L;
unsigned long s3c_irqwake_eintmask    = 0xffffffffL;


b.接着是
    /* ensure at least GESTATUS3 has the resume address */
    //将系统被唤醒后执行的函数s3c2410_cpu_resume物理地址写入S3C2410_GSTATUS3.
    __raw_writel(virt_to_phys(s3c2410_cpu_resume), S3C2410_GSTATUS3);


s3c2410_cpu_resume/s3c2410_cpu_suspend 这对逆操作定义在sleep.S中, 是汇编的实现.


c. 然后就是s3c2410_pm_do_save/s3c2410_pm_do_restore了,
这个宏定义就是用于保存/恢复核心的寄存器值.


===========================
参考: Document/pm.tx Documentation/arm/Samsung-S3C24XX/Suspend.txt
posted @ 2008-08-22 15:04 puppy 阅读(562) | 评论 (0)编辑 收藏

 

s3c2410 电源管理(2)--core function

R.wen

这部分说明kernel里面的电源管理的核心函数
这部分的代码在/kernel/power目录中

1. 我们在(1)中看到apm_suspend()调用以下这个函数, 我们就从这里开始

/* Type of a system sleep state; __bitwise/__force are checker annotations. */
typedef int __bitwise suspend_state_t;

/*
 * Sleep state values. Note that the value 2 is not assigned to any name,
 * and PM_SUSPEND_MAX (5) is one past the highest named state.
 */
#define PM_SUSPEND_ON        ((__force suspend_state_t) 0)
#define PM_SUSPEND_STANDBY    ((__force suspend_state_t) 1)
#define PM_SUSPEND_MEM        ((__force suspend_state_t) 3)
#define PM_SUSPEND_DISK        ((__force suspend_state_t) 4)
#define PM_SUSPEND_MAX        ((__force suspend_state_t) 5)


/**
 *    pm_suspend - Externally visible function for suspending system.
 *    @state:        Enumerated value of state to enter.
 *
 *    Determine whether or not value is within range and, if it is, hand it
 *    to enter_state(). The ARM APM code passes PM_SUSPEND_MEM here.
 *
 *    Returns -EINVAL for PM_SUSPEND_ON and below and for PM_SUSPEND_MAX and
 *    above, otherwise the result of enter_state().
 */
int pm_suspend(suspend_state_t state)
{
    /*
     * PM_SUSPEND_MAX is the upper bound of the state range, not a state
     * that can be entered, so it is rejected together with everything
     * above it (enter_state() uses the value to index pm_states[]).
     */
    if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
        return -EINVAL;

    return enter_state(state);
}


/**
 *    enter_state - Do common work of entering low-power state.
 *    @state:        pm_state structure for state we're entering.
 *
 *    Only one caller may be in a sleep transition at a time: if pm_sem is
 *    already held, return -EBUSY rather than wait. PM_SUSPEND_DISK is
 *    handed to pm_suspend_disk(). Every other state runs the three phases
 *    prepare, enter and finish; finish is run whenever prepare succeeded.
 *    Returns the error code of the last phase that produced one.
 */
static int enter_state(suspend_state_t state)
{
    int error;

    /* take the lock without sleeping */
    if (down_trylock(&pm_sem))
        return -EBUSY;

    if (state == PM_SUSPEND_DISK) {
        /* suspend-to-disk request: not the case this article follows */
        error = pm_suspend_disk();
    } else {
        /* prepare phase */
        pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
        error = suspend_prepare(state);

        if (!error) {
            /* enter phase */
            pr_debug("PM: Entering %s sleep\n", pm_states[state]);
            error = suspend_enter(state);

            /* suspend is over: restore the previous state */
            pr_debug("PM: Finishing wakeup.\n");
            suspend_finish(state);
        }
    }

    up(&pm_sem);
    return error;
}



2.1 准备阶段, 为状态变换做准备
/**
*    suspend_prepare - Do prep work before entering low-power state.
*    @state:        State we're entering.
*
*    This is common code that is called for each state that we're
*    entering. Allocate a console, stop all processes, then make sure
*    the platform can enter the requested state.
*
*    Returns 0 on success. On failure the steps already taken are undone
*    in reverse order through the Finish/Thaw/Enable_cpu labels and the
*    error code is returned (-EPERM when pm_ops or its ->enter is missing).
*/

static int suspend_prepare(suspend_state_t state)
{
    int error = 0;
    unsigned int free_pages;

    if (!pm_ops || !pm_ops->enter)
        return -EPERM;

    pm_prepare_console();

    disable_nonboot_cpus();

    if (num_online_cpus() != 1) {
        error = -EPERM;
        goto Enable_cpu;
    }

    // process handling: freeze the processes
    if (freeze_processes()) {
        error = -EAGAIN;
        goto Thaw;
    }

    // memory handling: make sure at least FREE_PAGE_NUMBER pages are free
    if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) {
        pr_debug("PM: free some memory\n");
        shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
        if (nr_free_pages() < FREE_PAGE_NUMBER) {
            error = -ENOMEM;
            printk(KERN_ERR "PM: No enough memory\n");
            goto Thaw;
        }
    }

    // call the architecture-specific hook, registered at system initialisation
    if (pm_ops->prepare) {
        if ((error = pm_ops->prepare(state)))
            goto Thaw;
    }

    // suspend the devices
    if ((error = device_suspend(PMSG_SUSPEND))) {
        printk(KERN_ERR "Some devices failed to suspend\n");
        goto Finish;
    }
    return 0;
Finish:
    if (pm_ops->finish)
        pm_ops->finish(state);
Thaw:
    thaw_processes();
Enable_cpu:
    enable_nonboot_cpus();
    pm_restore_console();
    return error;
}

2.2挂起设备
/**
*    device_suspend - Save state and stop all devices in system.
*    @state:        Power state to put each device in.
*
*    Walk the dpm_active list, call ->suspend() for each device, and move
*    it to dpm_off.
*    Check the return value for each. If it returns 0, then we move the
*    the device to the dpm_off list. If it returns -EAGAIN, we move it to
*    the dpm_off_irq list. If we get a different error, try and back out.
*
*    If we hit a failure with any of the devices, call device_resume()
*    above to bring the suspended devices back to life.
*
*/

int device_suspend(pm_message_t state)
{
    int error = 0;

    down(&dpm_sem);
    down(&dpm_list_sem);
    /* Walk the device list. When a device is registered with the system it
     * is also added to this dpm_active list. The walk takes entries from the
     * tail (dpm_active.prev) and stops at the first error. */
    while (!list_empty(&dpm_active) && error == 0) {
        struct list_head * entry = dpm_active.prev;
        struct device * dev = to_device(entry);

        /* Hold a reference while the list lock is dropped for the call. */
        get_device(dev);
        up(&dpm_list_sem);

        /* Suspend this device. */
        error = suspend_device(dev, state);

        down(&dpm_list_sem);

        /* Check if the device got removed */
        /* Put it on an "off" list so that it can be resumed later. */
        if (!list_empty(&dev->power.entry)) {
            /* Move it to the dpm_off or dpm_off_irq list */
            if (!error) {
                list_del(&dev->power.entry);
                list_add(&dev->power.entry, &dpm_off);
            } else if (error == -EAGAIN) {
                list_del(&dev->power.entry);
                list_add(&dev->power.entry, &dpm_off_irq);
                /* -EAGAIN is not treated as a failure of the walk. */
                error = 0;
            }
        }
        if (error)
            printk(KERN_ERR "Could not suspend device %s: "
                "error %d\n", kobject_name(&dev->kobj), error);
        put_device(dev);
    }
    up(&dpm_list_sem);
    if (error) { /* Something failed: restore the previous state. */
        /* we failed... before resuming, bring back devices from
        * dpm_off_irq list back to main dpm_off list, we do want
        * to call resume() on them, in case they partially suspended
        * despite returning -EAGAIN
        */
        while (!list_empty(&dpm_off_irq)) {
            struct list_head * entry = dpm_off_irq.next;
            list_del(entry);
            list_add(entry, &dpm_off);
        }
        dpm_resume();
    }
    up(&dpm_sem);
    return error;
}


/**
*    suspend_device - Save state of one device.
*    @dev:    Device.
*    @state:    Power state device is entering.
*/

int suspend_device(struct device * dev, pm_message_t state)
{
    int error = 0;

    down(&dev->sem);
    /* A device whose power_state.event is already non-zero is only logged. */
    if (dev->power.power_state.event) {
        dev_dbg(dev, "PM: suspend %d-->%d\n",
            dev->power.power_state.event, state.event);
    }
    /* A parent whose power_state.event is already non-zero is reported as
     * an error message, but the suspend still proceeds. */
    if (dev->power.pm_parent
            && dev->power.pm_parent->power.power_state.event) {
        dev_err(dev,
            "PM: suspend %d->%d, parent %s already %d\n",
            dev->power.power_state.event, state.event,
            dev->power.pm_parent->bus_id,
            dev->power.pm_parent->power.power_state.event);
    }

    /* Remember the previous state. */
    dev->power.prev_state = dev->power.power_state;

    /* The bus callback is only invoked while power_state.event is still 0. */
    if (dev->bus && dev->bus->suspend && !dev->power.power_state.event) {
        dev_dbg(dev, "suspending\n");

        /* Run the bus's suspend, which in turn runs the device's suspend. */
        error = dev->bus->suspend(dev, state);
    }
    up(&dev->sem);
    return error;
}

为了说明它是如何调用bus的suspend的, 这里插入一段设备的注册过程的描述:

/*
 * Run the CPU-specific init hook, then register every platform device
 * listed by the board description.
 *
 * Returns the error from cpu->init() if it fails. Failures to register
 * individual board devices are logged and then masked, so 0 is returned
 * whenever a board description is present.
 */
static int __init s3c_arch_init(void)
{
    int ret;

    // do the correct init for cpu

    if (cpu == NULL)
        panic("s3c_arch_init: NULL cpu\n");

    ret = (cpu->init)();
    if (ret != 0)
        return ret;

    /* board is a global variable; here it is the smdk2440_board shown below. */
    if (board != NULL) {
        struct platform_device **ptr = board->devices;
        int i;

        for (i = 0; i < board->devices_count; i++, ptr++) {
            /* This is the call that registers the device; its bus is "platform". */
            ret = platform_device_register(*ptr);

            if (ret) {
                /* The format string must stay inside one string literal. */
                printk(KERN_ERR "s3c24xx: failed to add board device %s (%d) @%p\n",
                       (*ptr)->name, ret, *ptr);
            }
        }

        /* mask any error, we may not need all these board
        * devices */
        ret = 0;
    }

    return ret;
}


// Defined in mach-smdk2440.c
/* Platform devices of the SMDK2440 board; referenced through
 * smdk2440_board.devices below. */
static struct platform_device *smdk2440_devices[] __initdata = {
    &s3c_device_usb,
    &s3c_device_lcd,
    &s3c_device_wdt,
    &s3c_device_i2c,
    &s3c_device_iis,
};

/* Board description: the device array above together with its length. */
static struct s3c24xx_board smdk2440_board __initdata = {
    .devices       = smdk2440_devices,
    .devices_count = ARRAY_SIZE(smdk2440_devices)
};


我们看到, 就是这个platform_device_register()将上面数组中的设备(这些设备在devs.c中定义)注册进platform bus中去的.


/**
*    platform_device_register - add a platform-level device
*    @pdev:    platform device we're adding
*
*/
int platform_device_register(struct platform_device * pdev)
{
    int i, ret = 0;

    if (!pdev)
        return -EINVAL;

    /* Devices without an explicit parent hang off platform_bus. */
    if (!pdev->dev.parent)
        pdev->dev.parent = &platform_bus;

    /* The device's bus is set to platform_bus_type; this is the only part
     * that matters for the suspend discussion. */
    pdev->dev.bus = &platform_bus_type;

    /* bus_id is "<name>.<id>" when an id is given, otherwise just the name. */
    if (pdev->id != -1)
        snprintf(pdev->dev.bus_id, BUS_ID_SIZE, "%s.%u", pdev->name, pdev->id);
    else
        strlcpy(pdev->dev.bus_id, pdev->name, BUS_ID_SIZE);

    /* Claim each resource of the device. */
    for (i = 0; i < pdev->num_resources; i++) {
        struct resource *p, *r = &pdev->resource[i];

        if (r->name == NULL)
            r->name = pdev->dev.bus_id;

        /* Pick a default parent from the resource type when none is set;
         * p stays NULL for types other than MEM and IO. */
        p = r->parent;
        if (!p) {
            if (r->flags & IORESOURCE_MEM)
                p = &iomem_resource;
            else if (r->flags & IORESOURCE_IO)
                p = &ioport_resource;
        }

        if (p && request_resource(p, r)) {
            printk(KERN_ERR
                   "%s: failed to claim resource %d\n",
                   pdev->dev.bus_id, i);
            ret = -EBUSY;
            goto failed;
        }
    }

    pr_debug("Registering platform device '%s'. Parent at %s\n",
        pdev->dev.bus_id, pdev->dev.parent->bus_id);

    ret = device_register(&pdev->dev);
    if (ret == 0)
        return ret;

    /* Undo: release the MEM/IO resources with an index below i. */
failed:
    while (--i >= 0)
        if (pdev->resource[i].flags & (IORESOURCE_MEM|IORESOURCE_IO))
            release_resource(&pdev->resource[i]);
    return ret;
}


再接着看看这个结构:

/* Bus type assigned to every platform device (pdev->dev.bus). */
struct bus_type platform_bus_type = {
    .name        = "platform",
    .match        = platform_match,

    /* The following two are the functions used for power management. */
    .suspend    = platform_suspend,
    .resume        = platform_resume,
};

我们在这里就可以清楚的看到, 它是会调用设备驱动的suspend实现的.
所以说, 系统挂起时, 设备也应该做相应的工作, 由于设备的特殊性, 这些就是留在设备里面来实现了.

/*
 * Bus-level suspend for platform devices: forwards to the bound driver's
 * ->suspend() once per level (SUSPEND_DISABLE, SUSPEND_SAVE_STATE,
 * SUSPEND_POWER_DOWN), stopping at the first non-zero result.
 * Returns 0 when no driver or no ->suspend() hook is present.
 */
static int platform_suspend(struct device * dev, pm_message_t state)
{
    int rc;

    if (!dev->driver || !dev->driver->suspend)
        return 0;

    rc = dev->driver->suspend(dev, state, SUSPEND_DISABLE);
    if (rc != 0)
        return rc;

    rc = dev->driver->suspend(dev, state, SUSPEND_SAVE_STATE);
    if (rc != 0)
        return rc;

    return dev->driver->suspend(dev, state, SUSPEND_POWER_DOWN);
}



3. enter阶段,
完成了prepare阶段后, 就是enter阶段了,即是进入了状态变换阶段了.
这就是:

/*
 * Enter the requested state with local interrupts disabled.
 * Devices are powered down first; only if that succeeds is the platform
 * ->enter() hook called, followed by device_power_up().
 * Returns the device_power_down() error, or the result of ->enter().
 */
static int suspend_enter(suspend_state_t state)
{
    unsigned long flags;
    int rc;

    local_irq_save(flags);

    rc = device_power_down(PMSG_SUSPEND);
    if (rc) {
        printk(KERN_ERR "Some devices failed to power down\n");
    } else {
        rc = pm_ops->enter(state);
        device_power_up();
    }

    local_irq_restore(flags);
    return rc;
}

我们看到,所有的工作都在pm_ops->enter(state)中去做了.
它完成了suspend/resume的状态转换.

/* Platform power-management callbacks, reached through the global pm_ops. */
struct pm_ops {
    suspend_disk_method_t pm_disk_mode;
    int (*prepare)(suspend_state_t state);  /* optional; called from suspend_prepare() */
    int (*enter)(suspend_state_t state);    /* required; called from suspend_enter() */
    int (*finish)(suspend_state_t state);   /* optional; called from suspend_finish() and on prepare failure */
};

这个结构在系统初始化时会被初始化, 且每个体系结构的pm_ops是不同的,
如s3c24xx的为:

/*
* Set to PM_DISK_FIRMWARE so we can quickly veto suspend-to-disk.
*/
static struct pm_ops s3c2410_pm_ops = {
    .pm_disk_mode    = PM_DISK_FIRMWARE,
    .prepare    = s3c2410_pm_prepare,   /* reached via pm_ops->prepare() */
    .enter        = s3c2410_pm_enter,   /* reached via pm_ops->enter() */
    .finish        = s3c2410_pm_finish, /* reached via pm_ops->finish() */
};

定义在arch/arm/mach-s3c2410/pm.c中.

我们在下一节再细看这个pm的实现.

4. finish阶段

/**
*    suspend_finish - Do final work before exiting suspend sequence.
*    @state:        State we're coming out of.
*
*    Call platform code to clean up, restart processes, and free the
*    console that we've allocated. This is not called for suspend-to-disk.
*/
我们看到, 这里是suspend_prepare的逆操作.
static void suspend_finish(suspend_state_t state)
{
    /* The steps of suspend_prepare() are undone in reverse order. */
    device_resume();
    if (pm_ops && pm_ops->finish)
        pm_ops->finish(state); /* architecture-specific operation */
    thaw_processes();
    enable_nonboot_cpus();
    pm_restore_console();
}

5. 系统resume

/**
*    device_resume - Restore state of each device in system.
*
*    Walk the dpm_off list, remove each entry, resume the device,
*    then add it to the dpm_active list.
*/

void device_resume(void)
{
    /* dpm_resume() does the work; this wrapper only takes dpm_sem around it. */
    down(&dpm_sem);
    dpm_resume();
    up(&dpm_sem);
}

/* Move every device on dpm_off back to dpm_active and resume it. */
void dpm_resume(void)
{
    down(&dpm_list_sem);
    while(!list_empty(&dpm_off)) { /* devices queued by device_suspend() */
        struct list_head * entry = dpm_off.next;
        struct device * dev = to_device(entry);

        get_device(dev);
        list_del_init(entry);
        list_add_tail(entry, &dpm_active);

        /* The list lock is dropped around the per-device resume call. */
        up(&dpm_list_sem);
        /* Only devices whose saved prev_state.event is 0 are resumed. */
        if (!dev->power.prev_state.event)
            resume_device(dev); /* done for each device */
        down(&dpm_list_sem);
        put_device(dev);
    }
    up(&dpm_list_sem);
}



/**
*    resume_device - Restore state for one device.
*    @dev:    Device.
*
*/

int resume_device(struct device * dev)
{
    int error = 0;

    down(&dev->sem);
    /* A parent whose power_state.event is still non-zero is only reported. */
    if (dev->power.pm_parent
            && dev->power.pm_parent->power.power_state.event) {
        dev_err(dev, "PM: resume from %d, parent %s still %d\n",
            dev->power.power_state.event,
            dev->power.pm_parent->bus_id,
            dev->power.pm_parent->power.power_state.event);
    }
    if (dev->bus && dev->bus->resume) {
        dev_dbg(dev,"resuming\n");
        error = dev->bus->resume(dev); /* the bus's resume, the counterpart of the bus's suspend */
    }
    up(&dev->sem);
    return error;
}


6. 体系相关的操作,
到这里, 我们只是剩下如下这些函数操作没说了, 这是真正执行硬件指令的操作.
/*
* Set to PM_DISK_FIRMWARE so we can quickly veto suspend-to-disk.
*/
static struct pm_ops s3c2410_pm_ops = {
    .pm_disk_mode    = PM_DISK_FIRMWARE,
    .prepare    = s3c2410_pm_prepare,   /* reached via pm_ops->prepare() */
    .enter        = s3c2410_pm_enter,   /* reached via pm_ops->enter() */
    .finish        = s3c2410_pm_finish, /* reached via pm_ops->finish() */
};
posted @ 2008-08-22 15:03 puppy 阅读(798) | 评论 (2)编辑 收藏
s3c2410 电源管理(1)--apm.c
R.wen


由于arm系统中没有bios设备, 所以只能为arm系统创建一个虚拟的字符设备与用户空间进行通讯.
这就是arch/arm/kernel/apm.c


1. 工作原理:

    这个apm中实现一个misc设备,实质上也是一个字符设备, misc设备的主设备号是10, 而apm_bios作为一个misc设备, 次设备号是134. 定义为:
/*
* The apm_bios device is one of the misc char devices.
* This is its minor number.
*/
#define APM_MINOR_DEV    134

    这个apm_bios设备通过ioctl系统调用和用户空间进行通讯, 即当用户进程通过ioctl发来suspend命令时, 它就传给内核, 使系统进入suspend状态.


2. 初始化


/*
 * Initialize the APM emulation: start the kapmd thread, publish the "apm"
 * proc entry and register the apm_bios misc device.
 * Returns 0 on success or a negative error code.
 */
static int __init apm_init(void)
{
    int ret;

    if (apm_disabled) {
        printk(KERN_NOTICE "apm: disabled on user request.\n");
        return -ENODEV;
    }

    if (PM_IS_ACTIVE()) {
        printk(KERN_NOTICE "apm: overridden by ACPI.\n");
        return -EINVAL;
    }

    pm_active = 1;

    /* Create a thread that processes the event queue; its work function is kapmd. */
    /* NOTE(author): this thread does not seem to do anything on ARM? */
    ret = kernel_thread(kapmd, NULL, CLONE_KERNEL);
    if (ret < 0) {
        pm_active = 0;
        return ret;
    }

    /* Export the apm information to user space through proc. */
#ifdef CONFIG_PROC_FS
    create_proc_info_entry("apm", 0, NULL, apm_get_info);
#endif

    /* Register the misc device. */
    ret = misc_register(&apm_device);
    if (ret != 0) {
        /* Registration failed: remove the proc entry, then wake kapmd and
         * wait on its exit completion. */
        remove_proc_entry("apm", NULL);

        pm_active = 0;
        wake_up(&kapmd_wait);
        wait_for_completion(&kapmd_exit);
    }

    return ret;
}

注册的结构为:
/* File operations of the apm_bios misc device. */
static struct file_operations apm_bios_fops = {
    .owner        = THIS_MODULE,
    .read        = apm_read,
    .poll        = apm_poll,
    .ioctl        = apm_ioctl,
    .open        = apm_open,
    .release    = apm_release,
};

/* The misc device itself; this is what apm_init() passes to misc_register(). */
static struct miscdevice apm_device = {
    .minor        = APM_MINOR_DEV,
    .name        = "apm_bios",
    .fops        = &apm_bios_fops
};


3. 结构函数的实现


当一个用户进程打开apm_bios设备时, 它就会调用这个函数

static int apm_open(struct inode * inode, struct file * filp)
{
    struct apm_user *as;
    //分配一个apm_user结构, 来表示一个用户进程
    as = (struct apm_user *)kmalloc(sizeof(*as), GFP_KERNEL);
    if (as) {
        memset(as, 0, sizeof(*as));

        /*
        * XXX - this is a tiny bit broken, when we consider BSD
        * process accounting. If the device is opened by root, we
        * instantly flag that we used superuser privs. Who knows,
        * we might close the device immediately without doing a
        * privileged operation -- cevans
        */
        //读写等权限设置
        as->suser = capable(CAP_SYS_ADMIN);
        as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
        as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;

        //将这个用户加入用户队列
        down_write(&user_list_lock);
        list_add(&as->list, &apm_user_list);
        up_write(&user_list_lock);

        //这是一个传递私有数据的一个通用方式
        filp->private_data = as;
    }

    return as ? 0 : -ENOMEM;
}


当用户空间进程去读这个设备时, 这个函数就会被调用.
这个函数的主要作用是将事件读出到用户空间
/*
 * read() handler: copies queued events of this opener to user space.
 * Returns the number of bytes copied, -EINVAL if the buffer cannot hold
 * one event, -EAGAIN for a non-blocking read on an empty queue, or
 * -EFAULT if the very first copy_to_user() fails.
 */
static ssize_t apm_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
{
    struct apm_user *as = fp->private_data;
    apm_event_t event;
    int i = count, ret = 0;

    if (count < sizeof(apm_event_t))
        return -EINVAL;

    /* Queue empty and the caller reads non-blocking: return immediately. */
    if (queue_empty(&as->queue) && fp->f_flags & O_NONBLOCK)
        return -EAGAIN;

    /* Otherwise wait until the queue is non-empty. */
    /* NOTE(review): the wait's return value is ignored; if the wait ends
     * with the queue still empty, the loop below is skipped and 0 is returned. */
    wait_event_interruptible(apm_waitqueue, !queue_empty(&as->queue));

    /* Copy the queued events to user space while room remains (i bytes left). */
    while ((i >= sizeof(event)) && !queue_empty(&as->queue)) {
        event = queue_get_event(&as->queue);

        /* Pre-set so that a failed copy of the first event yields -EFAULT. */
        ret = -EFAULT;
        if (copy_to_user(buf, &event, sizeof(event)))
            break;

        /* Update the state: a suspend event has now been read. */
        if (event == APM_SYS_SUSPEND || event == APM_USER_SUSPEND)
            as->suspend_state = SUSPEND_READ;

        buf += sizeof(event);
        i -= sizeof(event);
    }

    /* At least one event was copied: report the byte count instead. */
    if (i < count)
        ret = count - i;

    return ret;
}

/*
 * Back end of poll()/select(): reports whether there is data to read.
 * Registers on apm_waitqueue and returns POLLIN | POLLRDNORM when this
 * opener's event queue is non-empty, 0 otherwise.
 */
static unsigned int apm_poll(struct file *fp, poll_table * wait)
{
    struct apm_user *user = fp->private_data;

    poll_wait(fp, &apm_waitqueue, wait);

    if (queue_empty(&user->queue))
        return 0;

    return POLLIN | POLLRDNORM;
}



//这个是这个设备的核心函数, 用于内核与用户空间交互
/*
* apm_ioctl - handle APM ioctl
*
* APM_IOC_SUSPEND
*   This IOCTL is overloaded, and performs two functions. It is used to:
*     - initiate a suspend
*     - acknowledge a suspend read from /dev/apm_bios.
*   Only when everyone who has opened /dev/apm_bios with write permission
*   has acknowledge does the actual suspend happen.
*/
static int
apm_ioctl(struct inode * inode, struct file *filp, u_int cmd, u_long arg)
{
    struct apm_user *as = filp->private_data;
    unsigned long flags;
    int err = -EINVAL;

    /* Only openers flagged both suser and writer may issue ioctls. */
    if (!as->suser || !as->writer)
        return -EPERM;

    /* Any command other than APM_IOC_SUSPEND falls out with -EINVAL. */
    switch (cmd) {
    case APM_IOC_SUSPEND:
        as->suspend_result = -EINTR;

        if (as->suspend_state == SUSPEND_READ) {
            /*
            * If we read a suspend command from /dev/apm_bios,
            * then the corresponding APM_IOC_SUSPEND ioctl is
            * interpreted as an acknowledge.
            */
            as->suspend_state = SUSPEND_ACKED;
            suspends_pending--;
        } else {
            /*
            * Otherwise it is a request to suspend the system.
            * Queue an event for all readers, and expect an
            * acknowledge from all writers who haven't already
            * acknowledged.
            */
            queue_event(APM_USER_SUSPEND, as);
        }

        /*
        * If there are no further acknowledges required, suspend
        * the system.
        */
        if (suspends_pending == 0)
            apm_suspend(); /* put the system into the suspend state */

        /* Execution continues here after returning from suspend. */
        /*
        * Wait for the suspend/resume to complete. If there are
        * pending acknowledges, we wait here for them.
        *
        * Note that we need to ensure that the PM subsystem does
        * not kick us out of the wait when it suspends the threads.
        */
        flags = current->flags;
        current->flags |= PF_NOFREEZE;

        /*
        * Note: do not allow a thread which is acking the suspend
        * to escape until the resume is complete.
        */
        if (as->suspend_state == SUSPEND_ACKED)
            wait_event(apm_suspend_waitqueue,
                    as->suspend_state == SUSPEND_DONE);
        else
            wait_event_interruptible(apm_suspend_waitqueue,
                    as->suspend_state == SUSPEND_DONE);

        /* Restore the task flags saved before PF_NOFREEZE was set. */
        current->flags = flags;
        err = as->suspend_result;
        as->suspend_state = SUSPEND_NONE;
        break;
    }

    return err;
}



4. 事件队列函数
/*
 * Queue @event for every reader on apm_user_list except @sender, then
 * wake up the processes sleeping on apm_waitqueue.
 */
static void queue_event(apm_event_t event, struct apm_user *sender)
{
    struct apm_user *as;

    down_read(&user_list_lock);
    /* Add the event to every other user, i.e. everyone except the sender. */
    list_for_each_entry(as, &apm_user_list, list) {
        if (as != sender && as->reader)
            queue_event_one_user(as, event);
    }
    up_read(&user_list_lock);
    /* Wake up the processes waiting to read. */
    wake_up_interruptible(&apm_waitqueue);
}


/*
 * Queue @event for one user. For users flagged both suser and writer, a
 * suspend event also marks the user SUSPEND_PENDING and increments
 * suspends_pending, unless that user already has a suspend in progress,
 * in which case nothing is queued.
 */
static void queue_event_one_user(struct apm_user *as, apm_event_t event)
{
    if (as->suser && as->writer) {
        switch (event) {
        case APM_SYS_SUSPEND:
        case APM_USER_SUSPEND:
            /*
            * If this user already has a suspend pending,
            * don't queue another one.
            */
            if (as->suspend_state != SUSPEND_NONE)
                return;

            as->suspend_state = SUSPEND_PENDING;
            suspends_pending++;
            break;
        }
    }
    queue_add_event(&as->queue, event);
}


/*
 * Append @event to the circular event queue @q.
 *
 * event_head is advanced first and the event is stored at the new head.
 * If advancing the head makes it meet event_tail, the queue is full:
 * event_tail is advanced as well and a message is printed the first time
 * this happens.
 */
static void queue_add_event(struct apm_queue *q, apm_event_t event)
{
    q->event_head = (q->event_head + 1) % APM_MAX_EVENTS;
    if (q->event_head == q->event_tail) { /* queue is full */
        static int notified;

        /* Report the overflow only once. */
        if (notified++ == 0)
            printk(KERN_ERR "apm: an event queue overflowed\n");
        q->event_tail = (q->event_tail + 1) % APM_MAX_EVENTS;
    }
    q->events[q->event_head] = event; /* store at the head of the queue */
}

再来看一个出队的函数:
/*
 * Dequeue from the tail: advance event_tail by one slot (wrapping at
 * APM_MAX_EVENTS) and return the event stored in that slot.
 */
static inline apm_event_t queue_get_event(struct apm_queue *q)
{
    unsigned int slot = (q->event_tail + 1) % APM_MAX_EVENTS;

    q->event_tail = slot;
    return q->events[slot];
}


/*
* APM event queue management.
*/
/* Returns 1 when head and tail index the same slot (empty queue), else 0. */
static inline int queue_empty(struct apm_queue *q)
{
    if (q->event_head != q->event_tail)
        return 0;

    return 1;
}

/* Layout of the event queue. */
/*
* Maximum number of events stored
*/
#define APM_MAX_EVENTS        16

/*
 * Circular buffer of events. Both indices are advanced modulo
 * APM_MAX_EVENTS before the slot is used, and the queue counts as
 * empty when event_head == event_tail.
 */
struct apm_queue {
    unsigned int        event_head;     /* slot of the most recently added event */
    unsigned int        event_tail;     /* slot of the most recently removed event */
    apm_event_t        events[APM_MAX_EVENTS];
};


5. apm_suspend()
这里才是整个设备想做的事情--将系统转入suspend状态

/*
 * Suspend the system to memory, then tell every APM user that the system
 * is awake again and hand each of them the suspend result.
 */
static void apm_suspend(void)
{
    struct apm_user *as;

    /* Call the architecture-independent interface to put the system into
     * the suspend state. */
    int err = pm_suspend(PM_SUSPEND_MEM);


    /* Back from suspend. */
    /*
    * Anyone on the APM queues will think we're still suspended.
    * Send a message so everyone knows we're now awake again.
    */
    /* Send a resume event (sender NULL, so no user is skipped). */
    queue_event(APM_NORMAL_RESUME, NULL);

    /*
    * Finally, wake up anyone who is sleeping on the suspend.
    */
    down_read(&user_list_lock);
    list_for_each_entry(as, &apm_user_list, list) {
        as->suspend_result = err;
        as->suspend_state = SUSPEND_DONE; /* suspend finished */
    }
    up_read(&user_list_lock);

    /* Wake up the sleeping processes. */
    wake_up(&apm_suspend_waitqueue);
}
posted @ 2008-08-22 15:01 puppy 阅读(530) | 评论 (0)编辑 收藏
LINUX I/O资源如何映射到内核虚拟空间

(1) 系统启动初始化时iotable_init()
-----------------------------
MACHINE_START(AT91SAM9261EK, "ATMEL AT91SAM9261")
············································
        .map_io         = at91sam9261_map_io,
············································
MACHINE_END
--------------------------------------
/* .map_io hook of the machine description: passes the whole
 * at91sam9261_io_desc table to iotable_init(). */
void __init at91sam9261_map_io(void)
{
iotable_init(at91sam9261_io_desc, ARRAY_SIZE(at91sam9261_io_desc));
}
--------------------------------------
/*
* System peripheral registers mapped at virtual address.
*/

static struct map_desc at91sam9261_io_desc[] __initdata = {
{
   .virtual = AT91C_VA_BASE_SYS,
   .pfn = __phys_to_pfn(AT91C_BASE_AIC),
   .length = SZ_4K,
   .type = MT_DEVICE
},
{
   .virtual = AT91C_VA_BASE_EBI,
   .pfn = __phys_to_pfn(AT91C_BASE_EBI),
   .length = SZ_4K,
   .type = MT_DEVICE
},
     ··············································
};

<./linux/include/asm-arm/map.h>-----------------------
/* One static I/O mapping entry, as consumed by iotable_init(). */
struct map_desc {
unsigned long virtual;  /* virtual address the region is mapped at */
unsigned long pfn;      /* page frame number, see __phys_to_pfn() */
unsigned long length;   /* NOTE(review): presumably the size in bytes (SZ_4K in the table) - confirm */
unsigned int type;   /* flag bits: domain, read, write, cache, buffer */
};

#define __phys_to_pfn(paddr)             ((paddr) >> PAGE_SHIFT)
#define __pfn_to_phys(pfn)                 ((pfn) << PAGE_SHIFT)
--------------------------------------
   iotable_init()函数<./arch/arm/mm/mm-armv.c>循环调用create_mapping()函数完成IO的虚拟地址到物理地址的映射。


(2) 系统启动后,在驱动中ioremap()
--------------------------------------
/* Platform devices of the SMDK2410 board. */
static struct platform_device *smdk2410_devices[] __initdata = {
&s3c_device_usb,    // the individual on-chip devices
&s3c_device_lcd,    // s3c_device_lcd is used as the example below
&s3c_device_wdt,
&s3c_device_i2c,
&s3c_device_iis,
};
--------------------------------------
/* Platform device describing the on-chip LCD controller. */
struct platform_device s3c_device_lcd = {
.name = "s3c2410-lcd", // the device name should match the corresponding driver's name so that the driver can bind
.id = -1,                        // -1 means multiple devices of the same kind are not supported
.num_resources = ARRAY_SIZE(s3c_lcd_resource),
.resource = s3c_lcd_resource,
.dev = {
.dma_mask = &s3c_device_lcd_dmamask,
.coherent_dma_mask = 0xffffffffUL
}
};

-------------------------------------
/* LCD Controller */
static struct resource s3c_lcd_resource[] = {   //LCD的两个资源
[0] = {
.start = S3C2410_PA_LCD,
.end = S3C2410_PA_LCD + S3C2410_SZ_LCD,
.flags = IORESOURCE_MEM,
},
[1] = {
.start = IRQ_LCD,
.end = IRQ_LCD,
.flags = IORESOURCE_IRQ,
}

};
------------------------------------
/* -------Resource type -------- */
#define IORESOURCE_IO                0x00000100       
#define IORESOURCE_MEM                0x00000200
#define IORESOURCE_IRQ                0x00000400
#define IORESOURCE_DMA                0x00000800
------------------------------------

-----s3c_device_lcd的resource中硬件地址---------------

#define S3C2410_LCDREG(x) (x)

/* LCD control registers */
#define S3C2410_LCDCON1     S3C2410_LCDREG(0x00)
#define S3C2410_LCDCON2     S3C2410_LCDREG(0x04)
#define S3C2410_LCDCON3     S3C2410_LCDREG(0x08)
#define S3C2410_LCDCON4     S3C2410_LCDREG(0x0C)
#define S3C2410_LCDCON5     S3C2410_LCDREG(0x10)

/* LCD controller */
#define S3C2410_PA_LCD    (0x4D000000)
#define S3C24XX_SZ_LCD    SZ_1M
-----------------------------------
/**
* platform_device_register - add a platform-level device
* @pdev: platform device we're adding
*
*/
int platform_device_register(struct platform_device * pdev)
{
device_initialize(&pdev->dev);        // initialize the device structure
return platform_device_add(pdev); // add the on-chip device to the device hierarchy
}
------------------------------------------
/**
* platform_device_add - add a platform device to device hierarchy
* @pdev: platform device we're adding
*
* This is part 2 of platform_device_register(), though may be called
* separately _iff_ pdev was allocated by platform_device_alloc().
*/
int platform_device_add(struct platform_device *pdev)
{
int i, ret = 0;
if (!pdev)
   return -EINVAL;

if (!pdev->dev.parent)
   pdev->dev.parent = &platform_bus;
pdev->dev.bus = &platform_bus_type;

if (pdev->id != -1)
   snprintf(pdev->dev.bus_id, BUS_ID_SIZE, "%s.%u", pdev->name, pdev->id);
                 /* 若支持同类多个设备,则用pdev->name和pdev->id在总线上标识该设备 */
else
   strlcpy(pdev->dev.bus_id, pdev->name, BUS_ID_SIZE);
                /*               否则,用pdev->name(即"s3c2410-lcd")在总线上标识该设备               */

for (i = 0; i < pdev->num_resources; i++) {
                /*           遍历资源数,并为各自在总线地址空间请求分配            */
   struct resource *p, *r = &pdev->resource[i];

   if (r->name == NULL)
    r->name = pdev->dev.bus_id;

   p = r->parent;
   if (!p) {
    if (r->flags & IORESOURCE_MEM)
      p = &iomem_resource;
                                          /*    LCD寄存器地址作为IO内存资源分配   */
                                                      ----------------
                                                      struct resource iomem_resource = {
                                                              .name   = "PCI mem",
                                                              .start = 0UL,
                                                              .end    = ~0UL,
                                                              .flags = IORESOURCE_MEM,
                                                               };
                                                      ----------------
    else if (r->flags & IORESOURCE_IO)
      p = &ioport_resource;
   }
                      
   if (p && insert_resource(p, r)) {
                              /*   将LCD寄存器地址插入到IO内存空间 */
    printk(KERN_ERR
          "%s: failed to claim resource %d\n",
          pdev->dev.bus_id, i);
    ret = -EBUSY;
    goto failed;
   }
}

pr_debug("Registering platform device '%s'. Parent at %s\n",
   pdev->dev.bus_id, pdev->dev.parent->bus_id);

ret = device_add(&pdev->dev);
if (ret == 0)
   return ret;

failed:
while (--i >= 0)
   if (pdev->resource[i].flags & (IORESOURCE_MEM|IORESOURCE_IO))
    release_resource(&pdev->resource[i]);
return ret;
}
-----------------------------------------


/* Platform driver for the LCD controller; .driver.name is the same string
 * as the name of the s3c_device_lcd platform device ("s3c2410-lcd"). */
static struct platform_driver s3c2410fb_driver = {
.probe   = s3c2410fb_probe,
.remove   = s3c2410fb_remove,
.suspend = s3c2410fb_suspend,
.resume   = s3c2410fb_resume,
.driver   = {
   .name = "s3c2410-lcd",
   .owner = THIS_MODULE,
},
};

platform_driver_register(&s3c2410fb_driver)----->
driver_register(&drv->driver)----->
bus_add_driver(drv)----->
driver_attach(drv)----->
bus_for_each_dev(drv->bus, NULL, drv, __driver_attach)----->
__driver_attach(struct device * dev, void * data)----->
driver_probe_device(drv, dev)----->
really_probe(dev, drv)----->

在really_probe()中:
            为设备指派管理该设备的驱动:dev->driver = drv
            调用s3c2410fb_probe()初始化设备:drv->probe(dev)

---------------------------------
static int __init s3c2410fb_probe(struct platform_device *pdev)
{
·····························
                res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
                /*          取得LCD控制寄存器的物理地址           */

size = (res->end - res->start)+1;
info->mem = request_mem_region(res->start, size, pdev->name);
               /* 个人理解:设备注册时已经分配区域,驱动这里应该不是必须的*/

info->io = ioremap(res->start, size);
               /*      此时驱动便可以用指针info->io 读写LCD控制寄存器了    */
               /*              eg: readl(info->io + S3C2410_LCDCON1)              */
····························
}

--------------------------------
以下是AT91SAM9261EK的IOMEM:

root@ebd9261:~# cat /proc/iomem
00500000-005fffff : usb-ohci.0
00500000-005fffff : ohci_hcd
00600000-00600fff : sidsa-lcdc.0     //支持同类多个设备,在驱动中未分配I/O内存区域
20000000-23ffffff : System RAM
20022000-20225e47 : Kernel text
20226000-2028da23 : Kernel data
30000000-30000003 : dm9000.0   
30000000-30000003 : dm9000   
30000044-300000ff : dm9000.0
30000044-300000ff : dm9000
fffa4000-fffa7fff : at91_udc            //不支持同类多个设备,在驱动中也分配I/O内存区域
fffa4000-fffa7fff : at91_udc
fffb0000-fffb3fff : usart.1
fffb4000-fffb7fff : usart.2
fffc8000-fffcbfff : spi.0
fffff200-fffff3ff : usart.0

posted @ 2008-08-22 14:44 puppy 阅读(448) | 评论 (1)编辑 收藏
__setup 在内核中的作用
jeppeter


你的这个问题,我从google上查找到了一些资料,再结合内核源代码,就在这里把这个问题说的清楚一点.
首先,这里有一个简短的回答,

*****************************************************************
* David Wuertele (dave-gnus@bfnet.com) wrote:
> I'm trying to track down why ide_setup() doesn't seem to be called on
> my system.  I'm using KGDB to put breakpoints in ide_setup(), but they
> never get hit.  I see that ide_setup is defined as:
>
> int __init ide_setup (char *s)
> {
> ...
>
> and there is also a line:
>
> __setup("", ide_setup);
>
> What do these macros do?  How can I get KGDB to break in ide_setup()?

take a look at the source (include/linux/init.h).  it places code in a
special section, then is used during init/main.c::checksetup() to find
matching setup strings from the kernel boot commandline.  it's not going
to be called if you don't have an "idex=xxx" or "hdx=" string on the
commandline.  it's also called if you compile ide modular and give it
options during insmod.

cheers,
-chris

*******************************************************************************

从这上面的意思是这里会从main.c 中的checksetup函数中运行,这个函数是这样的

/*
 * Match @line against the kernel_param table that spans __setup_start to
 * __setup_end. For each entry whose str is a prefix of @line, the entry's
 * setup_func is called with the text after the prefix.
 * Returns 1 as soon as one handler returns non-zero, 0 otherwise.
 */
static int __init checksetup(char *line)
{
    struct kernel_param *entry = &__setup_start;

    /* do/while: the first entry is examined before the bound is checked. */
    do {
        int len = strlen(entry->str);

        if (strncmp(line, entry->str, len) == 0 &&
            entry->setup_func(line + len))
            return 1;

        entry++;
    } while (entry < &__setup_end);

    return 0;
}



这里的意思是从__setup_start开始处到__setup_end处查找一个数据结构,这个数据结构中有str与setup_func这两个成员变量.
只要其中的str与输入的参数字符串相匹配,就会调用该结构中setup_func所指向的函数,
对于你这里所说的 __setup("console=",console_setup); 就是你在启动linux内核的时候如果有这么一个参数输入console=ttyS1,那内核就会
把默认的tty定位为ttyS1,这个在console_setup函数的字符串处理中完成,因为它最后是确定preferred_console的参数.


那么,这一机制的实现是这样的,

__setup() 是一个宏定义,在include/linux/init.h这个文件中.
/* One boot-parameter handler entry, as scanned by checksetup(). */
struct kernel_param {
const char *str;            /* prefix compared against the command-line item */
int (*setup_func)(char *);  /* called with the text that follows the prefix */
};

extern struct kernel_param __setup_start, __setup_end;

/*
 * __setup(str, fn): defines the string __setup_str_<fn> (marked __initdata)
 * and a struct kernel_param named __setup_<fn> (marked __initsetup) whose
 * members are that string and fn. The macro does not end in a semicolon;
 * the user supplies it.
 */
#define __setup(str, fn) \
static char __setup_str_##fn[] __initdata = str; \
static struct kernel_param __setup_##fn __attribute__((unused)) __initsetup = { __setup_str_##fn, fn }

在这个情景中作了替换是这样的

static char __setup_str_console_setup[] = "console=";
static struct kernel_param __setup_console_setup = { __setup_str_console_setup, console_setup}



这样你还可能不是很清楚,那你就要参考arch/i386/vmlinuz.lds这个关于ld 链接器的脚本文件有这样的一段

__setup_start = .;
.setup.init : { *(.setup.init) }
__setup_end = .;


这里的意思就是__setup_start是一个节的开始,而__setup_end是一个节的结束,这个节的名称是.setup.init,
这个你可以用readelf -a这个来看一下你的vmlinux-2.4.20-8(后面的数字与你的内核版本有关)这个文件,
可以看到有一个叫.setup.init的节,__setup_start就是指这个节的开始,那这个节中有什么内容呢,其实就是一个
数据结构,一个就是str,一个就是setup_func,与我前面的说法相一致,那具体是什么呢,str指向的就是一个在.data.init节中存储的
字符串-----__initdata是一个宏,就是(__attribute__ ((__section__ (".data.init")))), 所以用该字符串的虚拟地址减去.data.init节加载的虚拟地址,
再加上.data.init节在vmlinux-2.4.20-8文件中的偏移量,就可以得到这个字符串在文件中的位置,
举个例子,所有的这些都是用readelf 与od 命令得到的
我现在用的内核版本,它的.setup.init的节在0x26dd60的文件偏移处.
[10] .data.init PROGBITS c0368040 268040 005d18 00 WA 0 0 32
[11] .setup.init PROGBITS c036dd60 26dd60 0001b0 00 WA 0 0 4

再查找console_setup在vmlinux-2.4.20-8所被映射为内存地址,
840: c0355d40 343 FUNC LOCAL DEFAULT 9 console_setup

这就可以知道了它所在的位置,就是0xc0355d40,这就是它的虚拟映射地址

再用下面一条命令
od --address-radix=x -t x4 vmlinux-2.4.20-8 |grep -A 20 26dd60 |head -20 | grep c0355d40
可以得到
26de40 c036943b c0355d10 c0369447 c0355d40

很明显,这个函数的处理字符串在内存中的地址是0xc0369447,与前面得到的.data.init节在内存映射中的位置
0xc0368040相减就是 0x1407,与.data.init在文件中的偏移量0x268040相加就得到0x269447
这样用
od --address-radix=x -a vmlinux-2.4.20-8 |grep -A 2 269440

就可以得到下面的内容,
269440 b l i n k = nul c o n s o l e = nul
269450 r e s e r v e = nul nul nul nul nul nul nul nul
269460 ` dc4 6 @ ` dc4 6 @ c p u f r e q =

"console="这个值果真就在这里.

(注:前面od 的选项 --address-radix= 表示的是显示文件偏移量的格式,默认下是o就是八进制, -t 表示显示文件二进制的形式
默认是o6 就是八进制的6位长,而-a表示显示的是字符串格式.)
posted @ 2008-08-22 14:38 puppy 阅读(447) | 评论 (0)编辑 收藏
仅列出标题
共5页: 1 2 3 4 5